From fa720c1da44c78ea08784a32c44b71a3899f349f Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Tue, 13 Jul 2021 16:59:30 +0200
Subject: [PATCH] [doiboost] added workflow for the ActionSet update dedicated to production

---
 .../process/oozie_app/config-default.xml     |  42 +++
 .../doiboost/process/oozie_app/workflow.xml  | 259 ++++++++++++++++++
 2 files changed, 301 insertions(+)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml

diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/config-default.xml
new file mode 100644
index 000000000..508202e30
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/config-default.xml
@@ -0,0 +1,42 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml
new file mode 100644
index 000000000..cf1cd97f7
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml
@@ -0,0 +1,259 @@
+<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorIntersectionMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+
+        <property>
+            <name>workingPath</name>
+            <description>the working Path</description>
+        </property>
+        <property>
+            <name>hostedByMapPath</name>
+            <description>the hostedByMap Path</description>
+        </property>
+        <property>
+            <name>outputPath</name>
+            <description>the Path of the sequence file action set</description>
+        </property>
+
+        <property>
+            <name>inputPathCrossref</name>
+            <description>the Crossref input path</description>
+        </property>
+        <property>
+            <name>inputPathMAG</name>
+            <description>the MAG working path</description>
+        </property>
+        <property>
+            <name>inputPathUnpayWall</name>
+            <description>the UnpayWall working path</description>
+        </property>
+        <property>
+            <name>workingPathOrcid</name>
+            <description>the ORCID working path</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="resume_from"/>
+
+    <decision name="resume_from">
+        <switch>
+            <case to="PreprocessMag">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
+            <case to="PreprocessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
+            <case to="ProcessORCID">${wf:conf('resumeFrom') eq 'ProcessORCID'}</case>
+            <case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
+            <case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
+            <default to="ConvertCrossrefToOAF"/>
+        </switch>
+    </decision>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ConvertCrossrefToOAF">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>ConvertCrossrefToOAF</name>
+            <class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
+            <arg>--targetPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="PreprocessMag"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="PreprocessMag">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert Mag to OAF Dataset</name>
+            <class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorIntersectionMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
+            <arg>--workingPath</arg><arg>${inputPathMAG}/process_p</arg>
+            <arg>--targetPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="PreprocessUW"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="PreprocessUW">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert UnpayWall to Dataset</name>
+            <class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
+            <arg>--targetPath</arg><arg>${workingPath}/uwPublication</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="ProcessORCID"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ProcessORCID">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert ORCID to Dataset</name>
+            <class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
+            <arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="CreateDOIBoost"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateDOIBoost">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create DOIBoost Infospace</name>
+            <class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorIntersectionMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
+            <arg>--affiliationPath</arg><arg>${inputPathMAG}/dataset/Affiliations</arg>
+            <arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/dataset/PaperAuthorAffiliations</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="GenerateActionSet"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="GenerateActionSet">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Generate DOIBoost ActionSet</name>
+            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
+            <arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
+            <arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
+            <arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
+            <arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
+            <arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
+            <arg>--sFilePath</arg><arg>${outputPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
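
For reference, the parameters declared in the workflow above are normally supplied through a job.properties file at submission time. The sketch below is illustrative only: the property names are the ones declared in the workflow, while every value (paths, memory sizes, application path) is a made-up placeholder to be replaced with the settings of the target cluster.

    # job.properties - illustrative values only, not part of this patch
    oozie.wf.application.path=hdfs://nameservice1/user/oozie/doiboost/process/oozie_app
    oozieActionShareLibForSpark2=spark2
    sparkDriverMemory=7G
    sparkExecutorMemory=7G
    sparkExecutorIntersectionMemory=15G
    sparkExecutorCores=4
    workingPath=/data/doiboost/process
    hostedByMapPath=/data/doiboost/hostedByMap
    outputPath=/data/doiboost/actionset
    inputPathCrossref=/data/doiboost/input/crossref
    inputPathMAG=/data/doiboost/input/mag
    inputPathUnpayWall=/data/doiboost/input/unpaywall
    workingPathOrcid=/data/doiboost/input/orcid
    # optional restart point, one of: PreprocessMag, PreprocessUW, ProcessORCID, CreateDOIBoost, GenerateActionSet
    # resumeFrom=PreprocessMag

With such a file in place, the job can be submitted with the standard Oozie CLI, for example:

    oozie job -oozie http://<oozie-host>:11000/oozie -config job.properties -run

where <oozie-host> stands for the Oozie server of the target cluster.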