[EOSC DUMP] Modified workflow to add the indicators taken from the action set

This commit is contained in:
Miriam Baglioni 2022-11-07 18:04:27 +01:00
parent 5742f63f39
commit ff366dd5b4
1 changed files with 104 additions and 4 deletions

View File

@ -153,13 +153,38 @@
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--resultPath</arg><arg>${workingDir}/dump/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/publicationextendedaffiliation</arg>
</spark>
<ok to="extend_publication_with_indicators"/>
<error to="Kill"/>
</action>
<!--
    Publication branch, indicator stage.
    Launches the ExtendWithUsageCounts Spark job on YARN (cluster mode).
    The resultPath argument points at dump/publicationextendedaffiliation
    under the working directory, i.e. the same location the action that
    transitions here writes as its outputPath (presumably this job's input).
    The outputPath argument is dump/publicationextended.
    Success continues to the wait_eosc_dump join; any failure goes to Kill.
-->
<action name="extend_publication_with_indicators">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Extend Dump Publication with indicators </name>
        <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendWithUsageCounts</class>
        <jar>dump-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <!-- Location of the action set the indicators are taken from -->
        <arg>--actionSetPath</arg>
        <arg>${actionSetPath}</arg>
        <!-- Publication dump already extended with affiliations -->
        <arg>--resultPath</arg>
        <arg>${workingDir}/dump/publicationextendedaffiliation</arg>
        <!-- Destination of the publication dump produced by this step -->
        <arg>--outputPath</arg>
        <arg>${workingDir}/dump/publicationextended</arg>
    </spark>
    <ok to="wait_eosc_dump"/>
    <error to="Kill"/>
</action>
<action name="dump_eosc_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -203,13 +228,38 @@
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--resultPath</arg><arg>${workingDir}/dump/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/datasetextendedaffiliation</arg>
</spark>
<ok to="extend_dataset_with_indicators"/>
<error to="Kill"/>
</action>
<!--
    Dataset branch, indicator stage.
    Launches the ExtendWithUsageCounts Spark job on YARN (cluster mode).
    The resultPath argument points at dump/datasetextendedaffiliation under
    the working directory, i.e. the same location the action that
    transitions here writes as its outputPath (presumably this job's input).
    The outputPath argument is dump/datasetextended.
    Success continues to the wait_eosc_dump join; any failure goes to Kill.
-->
<action name="extend_dataset_with_indicators">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Extend Dump Dataset with indicators </name>
        <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendWithUsageCounts</class>
        <jar>dump-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <!-- Location of the action set the indicators are taken from -->
        <arg>--actionSetPath</arg>
        <arg>${actionSetPath}</arg>
        <!-- Dataset dump already extended with affiliations -->
        <arg>--resultPath</arg>
        <arg>${workingDir}/dump/datasetextendedaffiliation</arg>
        <!-- Destination of the dataset dump produced by this step -->
        <arg>--outputPath</arg>
        <arg>${workingDir}/dump/datasetextended</arg>
    </spark>
    <ok to="wait_eosc_dump"/>
    <error to="Kill"/>
</action>
<action name="dump_eosc_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -254,12 +304,37 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultPath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/otherresearchproductextendedaffiliation</arg>
</spark>
<ok to="extend_orp_with_indicators"/>
<error to="Kill"/>
</action>
<!--
    Other-research-product (ORP) branch, indicator stage.
    Launches the ExtendWithUsageCounts Spark job on YARN (cluster mode).
    The resultPath argument points at
    dump/otherresearchproductextendedaffiliation under the working
    directory, i.e. the same location the action that transitions here
    writes as its outputPath (presumably this job's input).
    The outputPath argument is dump/otherresearchproductextended.
    Success continues to the wait_eosc_dump join; any failure goes to Kill.
-->
<action name="extend_orp_with_indicators">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Extend Dump ORP with indicators </name>
        <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendWithUsageCounts</class>
        <jar>dump-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <!-- Location of the action set the indicators are taken from -->
        <arg>--actionSetPath</arg>
        <arg>${actionSetPath}</arg>
        <!-- ORP dump already extended with affiliations -->
        <arg>--resultPath</arg>
        <arg>${workingDir}/dump/otherresearchproductextendedaffiliation</arg>
        <!-- Destination of the ORP dump produced by this step -->
        <arg>--outputPath</arg>
        <arg>${workingDir}/dump/otherresearchproductextended</arg>
    </spark>
    <ok to="wait_eosc_dump"/>
    <error to="Kill"/>
</action>
<action name="dump_eosc_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -304,12 +379,37 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultPath</arg><arg>${workingDir}/dump/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/softwareextendedaffiliation</arg>
</spark>
<ok to="extend_software_with_indicators"/>
<error to="Kill"/>
</action>
<!--
    Software branch, indicator stage.
    Launches the ExtendWithUsageCounts Spark job on YARN (cluster mode).
    The resultPath argument points at dump/softwareextendedaffiliation under
    the working directory, i.e. the same location the action that
    transitions here writes as its outputPath (presumably this job's input).
    The outputPath argument is dump/softwareextended.
    Success continues to the wait_eosc_dump join; any failure goes to Kill.
-->
<action name="extend_software_with_indicators">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <!-- Job display name; must identify the software branch so it can be
             told apart from the ORP job in the YARN / Spark history UI -->
        <name>Extend Dump Software with indicators </name>
        <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendWithUsageCounts</class>
        <jar>dump-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <!-- Location of the action set the indicators are taken from -->
        <arg>--actionSetPath</arg>
        <arg>${actionSetPath}</arg>
        <!-- Software dump already extended with affiliations -->
        <arg>--resultPath</arg>
        <arg>${workingDir}/dump/softwareextendedaffiliation</arg>
        <!-- Destination of the software dump produced by this step -->
        <arg>--outputPath</arg>
        <arg>${workingDir}/dump/softwareextended</arg>
    </spark>
    <ok to="wait_eosc_dump"/>
    <error to="Kill"/>
</action>
<!--
    Join node reached on success by the four extend_*_with_indicators
    actions (publication, dataset, orp, software); control then moves on to
    prepareResultProject.
    NOTE(review): presumably paired with a fork defined outside this excerpt;
    confirm that every path leaving that fork ends at this join.
-->
<join name="wait_eosc_dump" to="prepareResultProject"/>
<action name="prepareResultProject">