From 902b0db85aa37ffe195554279259fea4e914f53a Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 6 Nov 2020 17:19:28 +0100 Subject: [PATCH] try to make workflow and sub-workflow for making report and actual orcid cleaning --- .../graph/clean_orcid/oozie_app/workflow.xml | 21 +- .../wf/clean/oozie_app/workflow.xml | 232 ++++++++++++++++++ .../oa/graph/clean_orcid/wf/main/import.txt | 3 + .../wf/main/oozie_app/config-default.xml | 18 ++ .../wf/main/oozie_app/workflow.xml | 231 +++++++++++++++++ .../wf/report/oozie_app/workflow.xml | 189 ++++++++++++++ 6 files changed, 684 insertions(+), 10 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/clean/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/import.txt create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/report/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/oozie_app/workflow.xml index b2e4871078..88b3b7fe03 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/oozie_app/workflow.xml @@ -81,7 +81,7 @@ yarn cluster - prepare publication + orcid prepare publication eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob dhp-graph-mapper-${projectVersion}.jar @@ -107,7 +107,7 @@ 
yarn cluster - prepare publication + orcid prepare dataset eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob dhp-graph-mapper-${projectVersion}.jar @@ -133,7 +133,7 @@ yarn cluster - prepare publication + orcid prepare software eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob dhp-graph-mapper-${projectVersion}.jar @@ -159,7 +159,7 @@ yarn cluster - prepare publication + orcid prepare orp eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob dhp-graph-mapper-${projectVersion}.jar @@ -187,6 +187,7 @@ ${wf:conf('clean') eq false} ${wf:conf('clean') eq true} + @@ -242,7 +243,7 @@ --conf spark.sql.shuffle.partitions=7680 --preparedInfoPath${workingDir}/dataset - --outputPath${utputPath}/dataset + --outputPath${outputPath}/dataset --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset --inputPath${inputPath}/dataset --orcidInputPath${orcidInputPath} @@ -269,7 +270,7 @@ --conf spark.sql.shuffle.partitions=7680 --preparedInfoPath${workingDir}/otherresearchproduct - --outputPath${graphOutputPath}/otherresearchproduct + --outputPath${outputPath}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --inputPath${inputPath}/otherresearchproduct --orcidInputPath${orcidInputPath} @@ -353,9 +354,9 @@ - - - + + + @@ -430,7 +431,7 @@ --conf spark.sql.shuffle.partitions=7680 --preparedInfoPath${workingDir}/otherresearchproduct - --outputPath${graphOutputPath}/otherresearchproduct + --outputPath${outputPath}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --inputPath${inputPath}/otherresearchproduct --orcidInputPath${orcidInputPath} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/clean/oozie_app/workflow.xml new file mode 100644 index 0000000000..8a258a0db7 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/clean/oozie_app/workflow.xml @@ -0,0 +1,232 @@ + + + + + inputPath + the input path to read graph content + + + outputPath + the target path to store cleaned graph + + + orcidInputPath + the input path where to find the orcid authoritative information + + + inputPreparedInputPath + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + ${nameNode}/${inputPath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${nameNode}/${inputPath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + ${nameNode}/${inputPath}/project + ${nameNode}/${outputPath}/project + + + + + + + ${nameNode}/${inputPath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + + + + + + + + yarn + cluster + Clean ORCID for Publications + eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${inputPath}/publication + --outputPath${outputPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + --preparedInfoPath${inputPreparedInfoPath}/publication + --orcidInputPath${orcidInputPath} + + + + + + + + yarn + cluster + Clean ORCID for Datasets + eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --preparedInfoPath${inputPreparedInfoPath}/dataset + --outputPath${outputPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --inputPath${inputPath}/dataset + --orcidInputPath${orcidInputPath} + + + + + + + + yarn + cluster + Clean ORCID for ORP + eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --preparedInfoPath${inputPreparedInfoPath}/otherresearchproduct + --outputPath${outputPath}/otherresearchproduct + 
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --inputPath${inputPath}/otherresearchproduct + --orcidInputPath${orcidInputPath} + + + + + + + + yarn + cluster + Clean ORCID for Software + eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --preparedInfoPath${inputPreparedInfoPath}/software + --outputPath${outputPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --inputPath${inputPath}/software + --orcidInputPath${orcidInputPath} + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/import.txt b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/import.txt new file mode 100644 index 0000000000..c8671c4c1c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/import.txt @@ -0,0 +1,3 @@ +## This is a classpath-based import file (this header is required) +make_report classpath eu/dnetlib/dhp/oa/graph/clean_orcid/wf/report/oozie_app +clean_orcid classpath eu/dnetlib/dhp/oa/graph/clean_orcid/wf/clean/oozie_app \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/config-default.xml new file mode 100644 index 
0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/workflow.xml new file mode 100644 index 0000000000..a144b1bd94 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/main/oozie_app/workflow.xml @@ -0,0 +1,231 @@ + + + + + inputPath + the input path to read graph content + + + outputPath + the target path to store cleaned graph + + + orcidInputPath + the input path where to find the orcid authoritative information + + + clean + false + determines if the orcid should be cleaned in the graph (true) or the report should be produced (false) + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + yarn + cluster + orcid prepare publication + 
eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${inputPath}/publication + --outputPath${workingDir}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + + + + + + + + yarn + cluster + orcid prepare dataset + eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${inputPath}/dataset + --outputPath${workingDir}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + + + + + + + + yarn + cluster + orcid prepare software + eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf 
spark.sql.shuffle.partitions=7680 + + --inputPath${inputPath}/software + --outputPath${workingDir}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + + + + + + + + yarn + cluster + orcid prepare orp + eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${inputPath}/otherresearchproduct + --outputPath${workingDir}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + + + + + + + + + + ${wf:conf('clean') eq false} + ${wf:conf('clean') eq true} + + + + + + + + ${wf:appPath()}/make_report + + + + inputPreparedInfoPath + ${workingDir} + + + orcidInputPath + ${orcidInputPath} + + + + + + + + + + ${wf:appPath()}/clean_orcid + + + + inputPreparedInfoPath + ${workingDir} + + + orcidInputPath + ${orcidInputPath} + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/report/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/report/oozie_app/workflow.xml new file mode 100644 index 0000000000..1d8630a96a --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean_orcid/wf/report/oozie_app/workflow.xml @@ -0,0 +1,189 @@ + + + + + inputPath + the input path to read graph content + + + outputPath + the target path to store cleaned graph + + + orcidInputPath + the input path where to find the orcid authoritative information + + + 
inputPreparedInfoPath + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + yarn + cluster + Report ORCID on Publication + eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${inputPath}/publication + --outputPath${outputPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + --preparedInfoPath${inputPreparedInfoPath}/publication + --orcidInputPath${orcidInputPath} + + + + + + + + yarn + cluster + Report ORCID on Dataset + eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --preparedInfoPath${inputPreparedInfoPath}/dataset + --outputPath${outputPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --inputPath${inputPath}/dataset + --orcidInputPath${orcidInputPath} + + + + + + + + yarn + cluster + Report ORCID on ORP + eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --preparedInfoPath${inputPreparedInfoPath}/otherresearchproduct + --outputPath${outputPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --inputPath${inputPath}/otherresearchproduct + --orcidInputPath${orcidInputPath} + + + + + + + + yarn + cluster + Report ORCID on Software + eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --preparedInfoPath${inputPreparedInfoPath}/software + 
--outputPath${outputPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --inputPath${inputPath}/software + --orcidInputPath${orcidInputPath} + + + + + + + + + \ No newline at end of file