From 1edcd535810fce8a06f48eea9ceeef33284396ef Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 28 Apr 2020 20:25:09 +0200 Subject: [PATCH] added shell actions to download all 11 activities files from ORCID --- .../orcid/ActivitiesDecompressor.java | 3 +- .../oozie_app/workflow.xml | 460 +++++++++++++++++- 2 files changed, 454 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java index da44f5795..65d10c833 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java @@ -23,6 +23,7 @@ import org.mortbay.log.Log; public class ActivitiesDecompressor { private static final int MAX_XML_WORKS_PARSED = -1; + private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000; public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) throws Exception { @@ -127,7 +128,7 @@ public class ActivitiesDecompressor { Log.warn(e); } - if ((counter % 100000) == 0) { + if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) { Log.info("Current xml works parsed: " + counter); } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml index 3dabb765b..1c2ae89dd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml @@ -1,9 +1,75 @@ - + workingPath_activities the working dir base path + + shell_cmd_0 + wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 0 + + + shell_cmd_1 + wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 1 + + + shell_cmd_2 + wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 2 + + + shell_cmd_3 + wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 3 + + + shell_cmd_4 + wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 4 + + + shell_cmd_5 + wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 5 + + + shell_cmd_6 + wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 6 + + + shell_cmd_7 + wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 7 + + + shell_cmd_8 + wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 8 + + + shell_cmd_9 + wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 9 + + + shell_cmd_X + wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file X + @@ -18,16 +84,47 @@ - + - - - + + + + + + + + + + + + - + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + + + + + ${jobTracker} ${nameNode} @@ -41,7 +138,29 @@ - + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_1} + + + + + + + ${jobTracker} ${nameNode} @@ -55,7 +174,332 @@ - + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_2} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_2.tar.gz + -ooutput/authors_dois_2.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_3} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_3.tar.gz + -ooutput/authors_dois_3.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_4} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_4.tar.gz + -ooutput/authors_dois_4.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_5} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_5.tar.gz + -ooutput/authors_dois_5.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_6} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_6.tar.gz + -ooutput/authors_dois_6.seq + + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_7} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_7.tar.gz + -ooutput/authors_dois_7.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_8} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_8.tar.gz + -ooutput/authors_dois_8.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_9} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_9.tar.gz + -ooutput/authors_dois_9.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_X} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_X.tar.gz + -ooutput/authors_dois_X.seq + + + + + + \ No newline at end of file