Merge pull request 'Oozie workflow for cleancontext' (#216) from cleancontext into beta

Reviewed-on: #216 Looks good. We need to extend the cleaning workflow parameters to enable the extra step only when it is needed.
2022-04-22 15:46:40 +02:00 · 2022-04-22 15:46:40 +02:00 · 81242538e6
parent a82ec3aaaf 911ce0780a
commit 81242538e6
1 changed files with 142 additions and 1 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -13,6 +13,23 @@
            <name>isLookupUrl</name>
            <description>the address of the lookUp service</description>
        </property>
+        <property>
+            <name>shouldCleanContext</name>
+            <!-- Default added so that submissions that do not set this parameter keep working
+                 and skip the optional context-cleaning step. -->
+            <value>false</value>
+            <description>true if the context has to be cleaned; when not set it defaults to false and
+            the context cleaning step is skipped</description>
+        </property>
+        <property>
+            <name>contextId</name>
+            <value>sobigdata</value>
+            <description>It is the context id that should be removed from the result if the condition is matched.
+            Now it is just sobigdata. In a future implementation I plan to have the contextId as value in a json
+            where to specify also the constraints that should be verified to remove the context from the result</description>
+        </property>
+        <property>
+            <name>verifyParam</name>
+            <!-- NOTE(review): the default value below ends with a space; confirm that this is intentional
+                 and that the space is preserved when the parameter is resolved. -->
+            <value>gcube </value>
+            <description>It is the constraint to be verified. This time it is hardcoded as gcube and it is searched for in
+            the title. If the title starts with gcube then the context sobigdata will be removed from the result if present</description>
+        </property>

        <property>
            <name>sparkDriverMemory</name>
@ -275,7 +292,131 @@
        <error to="Kill"/>
    </action>

-    <join name="wait_clean" to="End"/>
+    <!-- After the wait_clean join, control passes to the clean_context decision instead of going directly to End. -->
+    <join name="wait_clean" to="clean_context"/>
+
+    <!-- Optional step: continues to fork_clean_context only when the shouldCleanContext
+         configuration value evaluates to true; in every other case the workflow goes to End. -->
+    <decision name="clean_context">
+        <switch>
+            <case to="fork_clean_context">${wf:conf('shouldCleanContext') eq true}</case>
+            <default to="End"/>
+        </switch>
+    </decision>
+
+
+    <!-- Starts the four context-cleaning actions, one for each result type
+         (publication, dataset, otherresearchproduct, software). -->
+    <fork name="fork_clean_context">
+        <path start="clean_publication_context"/>
+        <path start="clean_dataset_context"/>
+        <path start="clean_otherresearchproduct_context"/>
+        <path start="clean_software_context"/>
+    </fork>
+    <!-- Context cleaning for publications: runs CleanContextSparkJob on the publication folder
+         of the graph output path, passing the configured contextId and verifyParam. -->
+    <action name="clean_publication_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean publications context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <!-- Job arguments: each option name is followed by its value. -->
+            <arg>--inputPath</arg>
+            <arg>${graphOutputPath}/publication</arg>
+            <arg>--graphTableClassName</arg>
+            <arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+            <arg>--workingPath</arg>
+            <arg>${workingDir}/working/publication</arg>
+            <arg>--contextId</arg>
+            <arg>${contextId}</arg>
+            <arg>--verifyParam</arg>
+            <arg>${verifyParam}</arg>
+        </spark>
+        <!-- On success reach the wait_clean_context join; on failure go to the Kill node. -->
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Context cleaning for datasets: runs CleanContextSparkJob on the dataset folder
+         of the graph output path, passing the configured contextId and verifyParam. -->
+    <action name="clean_dataset_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean datasets Context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <!-- Job arguments: each option name is followed by its value. -->
+            <arg>--inputPath</arg>
+            <arg>${graphOutputPath}/dataset</arg>
+            <arg>--graphTableClassName</arg>
+            <arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+            <arg>--workingPath</arg>
+            <arg>${workingDir}/working/dataset</arg>
+            <arg>--contextId</arg>
+            <arg>${contextId}</arg>
+            <arg>--verifyParam</arg>
+            <arg>${verifyParam}</arg>
+        </spark>
+        <!-- On success reach the wait_clean_context join; on failure go to the Kill node. -->
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Context cleaning for other research products: runs CleanContextSparkJob on the
+         otherresearchproduct folder of the graph output path, passing the configured
+         contextId and verifyParam. -->
+    <action name="clean_otherresearchproduct_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean otherresearchproducts context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <!-- Job arguments: each option name is followed by its value. -->
+            <arg>--inputPath</arg>
+            <arg>${graphOutputPath}/otherresearchproduct</arg>
+            <arg>--graphTableClassName</arg>
+            <arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--workingPath</arg>
+            <arg>${workingDir}/working/otherresearchproduct</arg>
+            <arg>--contextId</arg>
+            <arg>${contextId}</arg>
+            <arg>--verifyParam</arg>
+            <arg>${verifyParam}</arg>
+        </spark>
+        <!-- On success reach the wait_clean_context join; on failure go to the Kill node. -->
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Context cleaning for software: runs CleanContextSparkJob on the software folder
+         of the graph output path, passing the configured contextId and verifyParam. -->
+    <action name="clean_software_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean softwares context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <!-- Job arguments: each option name is followed by its value. -->
+            <arg>--inputPath</arg>
+            <arg>${graphOutputPath}/software</arg>
+            <arg>--graphTableClassName</arg>
+            <arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--workingPath</arg>
+            <arg>${workingDir}/working/software</arg>
+            <arg>--contextId</arg>
+            <arg>${contextId}</arg>
+            <arg>--verifyParam</arg>
+            <arg>${verifyParam}</arg>
+        </spark>
+        <!-- On success reach the wait_clean_context join; on failure go to the Kill node. -->
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Join reached by the four context-cleaning actions on success; the workflow then proceeds to End. -->
+    <join name="wait_clean_context" to="End"/>

    <end name="End"/>
 </workflow-app>