From 55964cbd8183b38c985104e5c8dd96b73a652c4c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 22 Apr 2021 10:18:09 +0200 Subject: [PATCH] [orcid] large oozie workflow cleanup; updated workflow for the orcidnodoi actionset creation --- .../oozie_app/workflow.xml | 42 -- .../oozie_app/config-default.xml | 31 - .../orcid_activities/oozie_app/workflow.xml | 542 ------------------ .../oozie_app/config-default.xml | 22 - .../oozie_app/workflow.xml | 505 ---------------- .../oozie_app/workflow.xml | 99 ---- .../oozie_app/workflow.xml | 232 -------- .../oozie_app/config-default.xml | 26 - .../oozie_app/workflow.xml | 40 -- .../oozie_app/config-default.xml | 26 - .../orcid_summaries/oozie_app/workflow.xml | 68 --- .../orcid_update/oozie_app/workflow.xml | 163 ------ .../orcidnodoi/oozie_app/workflow.xml | 31 +- 13 files changed, 23 insertions(+), 1804 deletions(-) delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml deleted file mode 100644 index becdf0974e..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml +++ /dev/null @@ -1,42 +0,0 @@ - - - - inputPath - /data/orcid_activities_2020/no_doi_dataset - path where retrieve the already generated action set - - - outputPath - /data/orcid_activities_2020/test_import_orcid_no_doi - path where to store the action set - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - ${inputPath}/* - ${outputPath} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml deleted file mode 100644 index 05fe6d014f..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml +++ /dev/null @@ -1,31 +0,0 @@ - - - oozie.action.sharelib.for.java - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx2g - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml deleted file mode 100644 index ea4d33296c..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml +++ /dev/null @@ -1,542 +0,0 @@ - - - - workingPath - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 0 - - - shell_cmd_1 - wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 1 - - - shell_cmd_2 - wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 2 - - - shell_cmd_3 - wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 3 - - - shell_cmd_4 - wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 4 - - - shell_cmd_5 - wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 5 - - - shell_cmd_6 - wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 6 - - - shell_cmd_7 - wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 7 - - - shell_cmd_8 - wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 8 - - - shell_cmd_9 - wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 9 - - - shell_cmd_X - wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file X - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_0.tar.gz - -owno_doi_works/works_0.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_1} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_1.tar.gz - -owno_doi_works/works_1.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_2} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_2.tar.gz - -owno_doi_works/works_2.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_3} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_3.tar.gz - -owno_doi_works/works_3.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_4} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_4.tar.gz - -owno_doi_works/works_4.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_5} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_5.tar.gz - -owno_doi_works/works_5.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_6} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_6.tar.gz - -owno_doi_works/works_6.seq - -oewno_doi_enriched_works/ - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_7} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_7.tar.gz - -owno_doi_works/works_7.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_8} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_8.tar.gz - -owno_doi_works/works_8.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_9} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_9.tar.gz - -owno_doi_works/works_9.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_X} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_X.tar.gz - -owno_doi_works/works_X.seq - -oewno_doi_enriched_works/ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml deleted file mode 100644 index 5621415d97..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.action.sharelib.for.java - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx4g - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml deleted file mode 100644 index 1c2ae89ddc..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml +++ /dev/null @@ -1,505 +0,0 @@ - - - - workingPath_activities - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 0 - - - shell_cmd_1 - wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 1 - - - shell_cmd_2 - wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 2 - - - shell_cmd_3 - wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 3 - - - shell_cmd_4 - wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 4 - - - shell_cmd_5 - wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 5 - - - shell_cmd_6 - wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 6 - - - shell_cmd_7 - wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 7 - - - shell_cmd_8 - wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 8 - - - shell_cmd_9 - wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 9 - - - shell_cmd_X - wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file X - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_0.tar.gz - -ooutput/authors_dois_0.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_1} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_1.tar.gz - -ooutput/authors_dois_1.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_2} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_2.tar.gz - -ooutput/authors_dois_2.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_3} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_3.tar.gz - -ooutput/authors_dois_3.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_4} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_4.tar.gz - -ooutput/authors_dois_4.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_5} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_5.tar.gz - -ooutput/authors_dois_5.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_6} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_6.tar.gz - -ooutput/authors_dois_6.seq - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_7} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_7.tar.gz - -ooutput/authors_dois_7.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_8} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_8.tar.gz - -ooutput/authors_dois_8.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_9} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_9.tar.gz - -ooutput/authors_dois_9.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_X} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_X.tar.gz - -ooutput/authors_dois_X.seq - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml deleted file mode 100644 index 133a6f4bd2..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml +++ /dev/null @@ -1,99 +0,0 @@ - - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - spark2MaxExecutors - 20 - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - workingPath - the working dir base path - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - yarn-cluster - cluster - GenDoiAuthorList - eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList - dhp-doiboost-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.dynamicAllocation.enabled=true - --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - - -w${workingPath}/ - -aauthors/authors.seq - -xwxml/works/*.seq - -odoi_author_list/ - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml deleted file mode 100644 index 6f629c7540..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml +++ /dev/null @@ -1,232 +0,0 @@ - - - - workingPath - the working dir base path - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.java - ${oozieActionShareLibForSpark2} - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx2g - - - oozie.use.system.libpath - true - - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_0.tar.gz - -owxml/works/xml_works_0.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_1.tar.gz - -owxml/works/xml_works_1.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_2.tar.gz - -owxml/works/xml_works_2.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_3.tar.gz - -owxml/works/xml_works_3.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_4.tar.gz - -owxml/works/xml_works_4.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_5.tar.gz - -owxml/works/xml_works_5.seq - -oew--- - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_6.tar.gz - -owxml/works/xml_works_6.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_7.tar.gz - -owxml/works/xml_works_7.seq - -oew--- - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_8.tar.gz - -owxml/works/xml_works_8.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_9.tar.gz - -owxml/works/xml_works_9.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_X.tar.gz - -owxml/works/xml_works_X.seq - -oew--- - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml deleted file mode 100644 index 191654378a..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx8g - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml deleted file mode 100644 index 68d468ab33..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml +++ /dev/null @@ -1,40 +0,0 @@ - - - - workingPath - the working dir base path - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_summaries.tar.gz - -oxml/authors/ - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml deleted file mode 100644 index 191654378a..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx8g - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml deleted file mode 100644 index 8517f35ee0..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - workingPath - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz - - the shell command that downloads and puts to hdfs orcid summaries - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidDSManager - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_summaries.tar.gz - -oauthors/ - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml deleted file mode 100644 index 135e6a4c86..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml +++ /dev/null @@ -1,163 +0,0 @@ - - - - spark2MaxExecutors - 50 - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - workingPath - the working dir base path - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - yarn-cluster - cluster - UpdateOrcidAuthors - eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors - dhp-doiboost-${projectVersion}.jar - - --conf spark.dynamicAllocation.enabled=true - --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - - -w${workingPath}/ - -n${nameNode} - -f- - -o- - -t- - - - - - - - - yarn-cluster - cluster - UpdateOrcidWorks - eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks - dhp-doiboost-${projectVersion}.jar - - --conf spark.dynamicAllocation.enabled=true - --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - - -w${workingPath}/ - -n${nameNode} - -f- - -o- - -t- - - - - - - - - - - - - ${workingPath}/orcid_dataset/new_authors/* - ${workingPath}/orcid_dataset/authors - - - - - - - - - - - - ${workingPath}/orcid_dataset/new_works/* - ${workingPath}/orcid_dataset/works - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index 6513ff7e1a..365c4d5b4f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -1,5 +1,15 @@ + + workingPath + /data/orcid_activities_2020 + path where the collection workflow stores the ORCID data + + + outputPath + path where to store the action set + + spark2GenNoDoiDatasetMaxExecutors 40 @@ -35,10 +45,6 @@ spark2EventLogDir spark 2.* event log dir location - - workingPath - the working dir base path - @@ -83,11 +89,20 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - -w${workingPath}/ - -n${nameNode} - -ilast_orcid_dataset - -oewno_doi_dataset + --workingPath${workingPath}/ + --hdfsServerUri${nameNode} + --orcidDataFolderlast_orcid_dataset + --outputEnrichedWorksPathno_doi_dataset + + + + + + + ${workingPath}/no_doi_dataset/* + ${outputPath} +