[Clean Context] added logic to the cleaning workflow to also accommodate context cleaning

2022-04-21 13:02:14 +02:00 · 2022-04-21 13:02:14 +02:00 · 5b7d9e741c
parent ccba1a3db1
commit 5b7d9e741c
1 changed files with 108 additions and 0 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@@ -295,7 +295,115 @@
        <path start="clean_otherresearchproduct_context"/>
        <path start="clean_software_context"/>
    </fork>
    <!--
        Context cleaning for the publication table.
        Submits eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob to YARN in cluster mode,
        passing the table location, the model class used to read it, a working directory,
        and the contextId / verifyParam workflow properties.
        NOTE(review): the exact semantics of contextId and verifyParam are defined by the
        job class, which is not visible here; confirm against CleanContextSparkJob.
        On success the flow continues to the wait_clean_context join; on failure it goes
        to the node named Kill (defined outside this excerpt).
    -->
    <action name="clean_publication_context">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean publications context</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <!-- The shuffle partition count is hard-coded; all other tuning values come from workflow properties. -->
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <!-- Uses graphInputPath, the same root property as the dataset, otherresearchproduct and software context actions. -->
            <arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--workingPath</arg><arg>${workingDir}/working</arg>
            <arg>--contextId</arg><arg>${contextId}</arg>
            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
        </spark>
        <ok to="wait_clean_context"/>
        <error to="Kill"/>
    </action>
    <!--
        Context cleaning for the dataset table.
        Submits eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob to YARN in cluster mode
        against ${graphInputPath}/dataset, read as eu.dnetlib.dhp.schema.oaf.Dataset.
        NOTE(review): the exact semantics of contextId and verifyParam are defined by the
        job class, which is not visible here; confirm against CleanContextSparkJob.
        On success the flow continues to the wait_clean_context join; on failure it goes
        to the node named Kill (defined outside this excerpt).
    -->
    <action name="clean_dataset_context">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean datasets Context</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <!-- The shuffle partition count is hard-coded; all other tuning values come from workflow properties. -->
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--workingPath</arg><arg>${workingDir}/working</arg>
            <arg>--contextId</arg><arg>${contextId}</arg>
            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
        </spark>
        <ok to="wait_clean_context"/>
        <error to="Kill"/>
    </action>
    <!--
        Context cleaning for the otherresearchproduct table.
        Submits eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob to YARN in cluster mode
        against ${graphInputPath}/otherresearchproduct, read as
        eu.dnetlib.dhp.schema.oaf.OtherResearchProduct.
        NOTE(review): the exact semantics of contextId and verifyParam are defined by the
        job class, which is not visible here; confirm against CleanContextSparkJob.
        On success the flow continues to the wait_clean_context join; on failure it goes
        to the node named Kill (defined outside this excerpt).
    -->
    <action name="clean_otherresearchproduct_context">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean otherresearchproducts context</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <!-- The shuffle partition count is hard-coded; all other tuning values come from workflow properties. -->
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--workingPath</arg><arg>${workingDir}/working</arg>
            <arg>--contextId</arg><arg>${contextId}</arg>
            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
        </spark>
        <ok to="wait_clean_context"/>
        <error to="Kill"/>
    </action>
    <!--
        Context cleaning for the software table.
        Submits eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob to YARN in cluster mode
        against ${graphInputPath}/software, read as eu.dnetlib.dhp.schema.oaf.Software.
        NOTE(review): the exact semantics of contextId and verifyParam are defined by the
        job class, which is not visible here; confirm against CleanContextSparkJob.
        On success the flow continues to the wait_clean_context join; on failure it goes
        to the node named Kill (defined outside this excerpt).
    -->
    <action name="clean_software_context">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean softwares context</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <!-- The shuffle partition count is hard-coded; all other tuning values come from workflow properties. -->
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--workingPath</arg><arg>${workingDir}/working</arg>
            <arg>--contextId</arg><arg>${contextId}</arg>
            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
        </spark>
        <ok to="wait_clean_context"/>
        <error to="Kill"/>
    </action>
    <!--
        Join node targeted by the ok-transition of each of the four context-cleaning
        actions (publication, dataset, otherresearchproduct, software); once reached,
        the workflow transitions to the End node declared right after it.
    -->
    <join name="wait_clean_context" to="End"/>
    <end name="End"/>
 </workflow-app>