[orcidenrichment] refactoring

2024-11-06 14:15:34 +01:00 · 2024-11-06 14:15:34 +01:00 · f9531e0406
parent 1b4bbb2691
commit f9531e0406
2 changed files with 26 additions and 244 deletions
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json
@ -1,21 +1,32 @@
 [
  {
    "paramName":"s",
-    "paramLongName":"sourcePath",
+    "paramLongName":"graphPath",
    "paramDescription": "the path of the sequencial file to read",
    "paramRequired": true
  },

  {
    "paramName": "out",
-    "paramLongName": "outputPath",
+    "paramLongName": "targetPath",
    "paramDescription": "the path used to store temporary output files",
    "paramRequired": true
-  },
+  }, {
+  "paramName": "o",
+  "paramLongName": "orcidPath",
+  "paramDescription": "the path used to store temporary output files",
+  "paramRequired": true
+}, {
+  "paramName": "w",
+  "paramLongName": "workingDir",
+  "paramDescription": "the path used to store temporary output files",
+  "paramRequired": true
+},
  {
-    "paramName": "ssm",
-    "paramLongName": "isSparkSessionManaged",
-    "paramDescription": "true if the spark session is managed, false otherwise",
+    "paramName": "m",
+    "paramLongName": "matchingSource",
+    "paramDescription": "the path used to store temporary output files",
    "paramRequired": false
  }
+
 ]
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
@ -92,21 +92,14 @@
        <error to="Kill"/>
    </action>

-    <join name="copy_wait" to="fork_prepare_assoc_step1"/>
+    <join name="copy_wait" to="exec_propagation"/>

-    <fork name="fork_prepare_assoc_step1">
-        <path start="join_prepare_publication"/>
-        <path start="join_prepare_dataset"/>
-        <path start="join_prepare_otherresearchproduct"/>
-        <path start="join_prepare_software"/>
-    </fork>
-
-    <action name="join_prepare_publication">
+    <action name="exec_propagation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-PreparePhase1-Publications</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
+            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkPropagateOrcidAuthor</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -119,239 +112,17 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
-            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
+            <arg>--graphPath</arg><arg>${sourcePath}/</arg>
+            <arg>--orcidPath</arg><arg>${sourcePath}/</arg>
+            <arg>--workingDir</arg><arg>${workingDir}/</arg>
+            <arg>--targetPath</arg><arg>${outputPath}/</arg>
+            <arg>--matchingSource</arg><arg>graph</arg>
        </spark>
-        <ok to="wait"/>
+        <ok to="End"/>
        <error to="Kill"/>
    </action>

-    <action name="join_prepare_dataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-PreparePhase1-Dataset</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
-            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
-        </spark>
-        <ok to="wait"/>
-        <error to="Kill"/>
-    </action>

-    <action name="join_prepare_otherresearchproduct">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-PreparePhase1-ORP</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
-            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
-        </spark>
-        <ok to="wait"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="join_prepare_software">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-PreparePhase1-Software</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
-            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
-        </spark>
-        <ok to="wait"/>
-        <error to="Kill"/>
-    </action>
-
-    <join name="wait" to="prepare_assoc_step2"/>
-
-    <action name="prepare_assoc_step2">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-PreparePhase2</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep2</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
-        </spark>
-        <ok to="fork-join-exec-propagation"/>
-        <error to="Kill"/>
-    </action>
-
-    <fork name="fork-join-exec-propagation">
-        <path start="join_propagate_publication"/>
-        <path start="join_propagate_dataset"/>
-        <path start="join_propagate_otherresearchproduct"/>
-        <path start="join_propagate_software"/>
-    </fork>
-
-    <action name="join_propagate_publication">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-Publication</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=15000
-            </spark-opts>
-            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
-            <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
-        </spark>
-        <ok to="wait2"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="join_propagate_dataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-Dataset</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=8000
-            </spark-opts>
-            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
-            <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
-        </spark>
-        <ok to="wait2"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="join_propagate_otherresearchproduct">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-ORP</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=8000
-            </spark-opts>
-            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
-            <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
-        </spark>
-        <ok to="wait2"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="join_propagate_software">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>ORCIDPropagation-Software</name>
-            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
-            <jar>dhp-enrichment-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=4000
-            </spark-opts>
-            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
-            <arg>--outputPath</arg><arg>${outputPath}/software</arg>
-        </spark>
-        <ok to="wait2"/>
-        <error to="Kill"/>
-    </action>
-    
-    <join name="wait2" to="End"/>
    
    <end name="End"/>