Hosted By Map - extention of the workflow to consider also the application of the map to publications and datasources

2021-08-04 10:18:11 +02:00 · 2021-08-04 10:18:11 +02:00 · e94ae0b1de
parent 67ba4c40e0
commit e94ae0b1de
1 changed files with 99 additions and 7 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hostedbymap/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hostedbymap/oozie_app/workflow.xml
@ -9,6 +9,10 @@
            <name>outputPath</name>
            <description>the output path</description>
        </property>
+        <property>
+            <name>hostedByMapPath</name>
+            <description>the output path</description>
+        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -65,23 +69,30 @@
        </configuration>
    </global>

-    <start to="reset_outputpath"/>
+    <start to="resume_from"/>
+
+    <decision name="resume_from">
+        <switch>
+            <case to="remove_hbmpath">${wf:conf('resumeFrom') eq 'ProduceHBM'}</case>
+            <default to="prepareInfo"/>
+        </switch>
+    </decision>
+

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

-    <action name="reset_outputpath">
+    <action name="remove_hbmpath">
        <fs>
-            <delete path="${outputPath}"/>
-            <mkdir path="${outputPath}"/>
+            <delete path="${hostedByMapPath}"/>
+<!--            <mkdir path="${hostedByMapPath}"/>-->
        </fs>
        <ok to="fork_downloads_csv"/>
        <error to="Kill"/>
    </action>


-
    <fork name="fork_downloads_csv">
        <path start="download_gold"/>
        <path start="download_doaj"/>
@ -134,13 +145,94 @@
            </spark-opts>
            <arg>--datasourcePath</arg><arg>${sourcePath}/datasource</arg>
            <arg>--workingPath</arg><arg>${workingDir}</arg>
-            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--outputPath</arg><arg>${hostedByMapPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
-        <ok to="End"/>
+        <ok to="prepareInfo"/>
        <error to="Kill"/>
    </action>

+    <action name="prepareInfo">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <name>Prepare info to apply the hbm</name>
+            <class>eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
+            <arg>--graphPath</arg><arg>${sourcePath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="fork_apply"/>
+        <error to="Kill"/>
+    </action>
+
+    <fork name="fork_apply">
+        <path start="apply_result"/>
+        <path start="apply_datasource"/>
+    </fork>
+
+    <action name="apply_result">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <name>Apply hbm to result</name>
+            <class>eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToResult</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
+            <arg>--graphPath</arg><arg>${sourcePath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="join_apply"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="apply_datasource">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <name>Apply hbm to datasource</name>
+            <class>eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToDatasource</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--outputPath</arg><arg>${outputPath}/datasource</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
+            <arg>--graphPath</arg><arg>${sourcePath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="join_apply"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name="join_apply" to="End"/>


    <end name="End"/>