From 8f61063201720534116a092b375131415ac149e7 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 10 Jan 2024 19:42:22 +0100 Subject: [PATCH] Added workflow --- .../collection/orcid/OrcidGetUpdatesFile.java | 2 +- .../orcid/update/oozie_app/config-default.xml | 23 +++++++ .../orcid/update/oozie_app/workflow.xml | 69 +++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidGetUpdatesFile.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidGetUpdatesFile.java index 3bfe72328..39da749e2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidGetUpdatesFile.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidGetUpdatesFile.java @@ -41,7 +41,7 @@ public class OrcidGetUpdatesFile { final String targetPath = parser.get("targetPath"); log.info("got variable targetPath: {}", targetPath); - //http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar + final String apiURL = parser.get("apiURL"); log.info("got variable apiURL: {}", apiURL); diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/config-default.xml new file mode 100644 index 000000000..dd3c32c62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/config-default.xml @@ -0,0 +1,23 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + 
+ oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/workflow.xml new file mode 100644 index 000000000..74a88bae6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/update/oozie_app/workflow.xml @@ -0,0 +1,69 @@ + + + + targetPath + the path to store the original ORCID dump + + + apiURL + The URL of the update CSV list + http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar + + + accessToken + The access token + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.collection.orcid.OrcidGetUpdatesFile + --namenode${nameNode} + --targetPath${targetPath} + --apiURL${apiURL} + --accessToken${accessToken} + + + + + + + + yarn + cluster + Generate ORCID Tables + eu.dnetlib.dhp.collection.orcid.SparkGenerateORCIDTable + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=2g + --conf spark.sql.shuffle.partitions=3000 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${targetPath}/extracted + --targetPath${targetPath}/tables + --masteryarn + + + + + + \ No newline at end of file