From c0c2e05eae56c3dad6e111177d88f1959b654d2e Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 17 Nov 2020 18:23:12 +0100 Subject: [PATCH] added wf to extracting authors and works xml data from orcid dump to hdfs; added wf to download the lamda file (containing last orcid update informations) from orcid to hdfs --- .../orcid/ActivitiesDecompressor.java | 61 +++++ .../orcid/ExtractXMLActivitiesData.java | 54 ++++ .../orcid/ExtractXMLSummariesData.java | 56 +++++ .../doiboost/orcid/SummariesDecompressor.java | 64 +++++ .../doiboost/orcid/xml/XMLRecordParser.java | 31 +++ .../orcid_download/oozie_app/workflow.xml | 45 ---- .../oozie_app/workflow.xml | 232 ++++++++++++++++++ .../oozie_app/config-default.xml | 26 ++ .../oozie_app/workflow.xml | 40 +++ .../oozie_app/config-default.xml | 0 .../oozie_app/workflow.xml | 64 +++++ 11 files changed, 628 insertions(+), 45 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{orcid_download => orcid_updates_download}/oozie_app/config-default.xml (100%) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java index 02d2b267b..420c363ec 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java @@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.io.compress.GzipCodec; import org.mortbay.log.Log; import eu.dnetlib.doiboost.orcid.model.WorkData; @@ -143,4 +144,64 @@ public class ActivitiesDecompressor { Log.info("Error from Orcid found: " + errorFromOrcidFound); Log.info("Error parsing xml work found: " + xmlParserErrorFound); } + + public static void extractXML(Configuration conf, String inputUri, Path outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + int counter = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + if (entry.isDirectory() || !filename.contains("works")) { + } else { + counter++; + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + String xml = buffer.toString(); + String[] filenameParts = filename.split("/"); + final Text key = new Text( + XMLRecordParser + .retrieveOrcidIdFromActivity( + xml.getBytes(), filenameParts[filenameParts.length - 1])); + final Text value = new Text(xml); + writer.append(key, value); + if ((counter % 100000) == 0) { + Log.info("Current xml works extracted: " + counter); + } + } + } + } + } + Log.info("Activities extraction completed"); + Log.info("Total XML works parsed: " + counter); + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } + } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java new file mode 100644 index 000000000..c834efa20 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java @@ -0,0 +1,54 @@ + +package eu.dnetlib.doiboost.orcid; + +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.mortbay.log.Log; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork; + +public class ExtractXMLActivitiesData extends OrcidDSManager { + private String outputWorksPath; + private String activitiesFileNameTarGz; + + public static void main(String[] args) throws IOException, Exception { + ExtractXMLActivitiesData extractXMLActivitiesData = new ExtractXMLActivitiesData(); + extractXMLActivitiesData.loadArgs(args); + extractXMLActivitiesData.extractWorks(); + } + + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenOrcidAuthorWork.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json"))); + parser.parseArgument(args); + + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + workingPath = parser.get("workingPath"); + Log.info("Working Path: " + workingPath); + activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); + Log.info("Activities File Name: " + activitiesFileNameTarGz); + outputWorksPath = parser.get("outputWorksPath"); + Log.info("Output Author Work Data: " + outputWorksPath); + } + + private void extractWorks() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz); + Path outputPath = new Path( + hdfsServerUri + .concat(workingPath) + .concat(outputWorksPath)); + ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java new file mode 100644 index 000000000..843889108 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java @@ -0,0 +1,56 @@ + +package eu.dnetlib.doiboost.orcid; + +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.mortbay.log.Log; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork; + +public class ExtractXMLSummariesData extends OrcidDSManager { + + private String outputAuthorsPath; + private String summariesFileNameTarGz; + + public static void main(String[] args) throws IOException, Exception { + ExtractXMLSummariesData extractXMLSummariesData = new ExtractXMLSummariesData(); + extractXMLSummariesData.loadArgs(args); + extractXMLSummariesData.extractAuthors(); + } + + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenOrcidAuthorWork.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json"))); + parser.parseArgument(args); + + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + workingPath = parser.get("workingPath"); + Log.info("Working Path: " + workingPath); + summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); + Log.info("Summaries File Name: " + summariesFileNameTarGz); + outputAuthorsPath = parser.get("outputAuthorsPath"); + Log.info("Output Authors Data: " + outputAuthorsPath); + } + + public void extractAuthors() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz); + Path outputPath = new Path( + hdfsServerUri + .concat(workingPath) + .concat(outputAuthorsPath) + .concat("xml_authors.seq")); + SummariesDecompressor.extractXML(conf, tarGzUri, outputPath); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index d1b2a1d73..c16899977 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.io.compress.GzipCodec; import org.mortbay.log.Log; import eu.dnetlib.dhp.schema.orcid.AuthorData; @@ -160,4 +161,67 @@ public class SummariesDecompressor { Log.info("Error from Orcid found: " + errorFromOrcidFound); Log.info("Error parsing xml record found: " + xmlParserErrorFound); } + + public static void extractXML(Configuration conf, String inputUri, Path outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + int counter = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; + CompressionCodec Codec = new GzipCodec(); + org.apache.hadoop.io.SequenceFile.Writer.Option optCom = SequenceFile.Writer + .compression(SequenceFile.CompressionType.RECORD, Codec); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class), optCom)) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + if (entry.isDirectory()) { + Log.debug("Directory entry name: " + entry.getName()); + } else { + Log.debug("XML record entry name: " + entry.getName()); + counter++; + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + String xml = buffer.toString(); + final Text key = new Text( + XMLRecordParser + .retrieveOrcidIdFromSummary( + xml.getBytes(), filename.split("/")[2].substring(0, 19))); + final Text value = new Text(xml); + writer.append(key, value); + } + if ((counter % 100000) == 0) { + Log.info("Current xml records extracted: " + counter); + } + } + } + } + Log.info("Summaries extract completed"); + Log.info("Total XML records parsed: " + counter); + + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java index a807cf132..cc9abb621 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java @@ -4,6 +4,8 @@ package eu.dnetlib.doiboost.orcid.xml; import java.util.Arrays; import java.util.List; +import org.mortbay.log.Log; + import com.ximpleware.AutoPilot; import com.ximpleware.EOFException; import com.ximpleware.EncodingException; @@ -126,4 +128,33 @@ public class XMLRecordParser { } return workData; } + + public static String retrieveOrcidIdFromSummary(byte[] bytes, String defaultValue) + throws VtdException, ParseException { + return retrieveOrcidId(bytes, defaultValue, NS_RECORD, NS_RECORD_URL, "//record:record", "path").substring(1); + } + + public static String retrieveOrcidIdFromActivity(byte[] bytes, String defaultValue) + throws VtdException, ParseException { + return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code"); + } + + private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath, + String idAttributeName) + throws VtdException, ParseException { + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(ns, nsUrl); + List recordNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, xpath, Arrays.asList(idAttributeName)); + if (!recordNodes.isEmpty()) { + return (recordNodes.get(0).getAttributes().get(idAttributeName)); + } + Log.info("id not found - default: " + defaultValue); + return defaultValue; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml deleted file mode 100644 index 1f9adeb4d..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - workingPathOrcid - the working dir base path - - - token - access token - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidDownloader - -d${workingPathOrcid}/ - -n${nameNode} - -flast_modified.csv - -odownload/ - -t${token} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml new file mode 100644 index 000000000..6f629c754 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml @@ -0,0 +1,232 @@ + + + + workingPath + the working dir base path + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.java + ${oozieActionShareLibForSpark2} + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx2g + + + oozie.use.system.libpath + true + + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_0.tar.gz + -owxml/works/xml_works_0.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_1.tar.gz + -owxml/works/xml_works_1.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_2.tar.gz + -owxml/works/xml_works_2.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_3.tar.gz + -owxml/works/xml_works_3.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_4.tar.gz + -owxml/works/xml_works_4.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_5.tar.gz + -owxml/works/xml_works_5.seq + -oew--- + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_6.tar.gz + -owxml/works/xml_works_6.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_7.tar.gz + -owxml/works/xml_works_7.seq + -oew--- + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_8.tar.gz + -owxml/works/xml_works_8.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_9.tar.gz + -owxml/works/xml_works_9.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_X.tar.gz + -owxml/works/xml_works_X.seq + -oew--- + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml new file mode 100644 index 000000000..191654378 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml @@ -0,0 +1,26 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx8g + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml new file mode 100644 index 000000000..68d468ab3 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml @@ -0,0 +1,40 @@ + + + + workingPath + the working dir base path + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_summaries.tar.gz + -oxml/authors/ + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/config-default.xml rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml new file mode 100644 index 000000000..a3daab116 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml @@ -0,0 +1,64 @@ + + + + workingPath + the working dir base path + + + token + access token + + + shell_cmd + wget -O /tmp/last_modified.csv.tar http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar ; hdfs dfs -copyFromLocal /tmp/last_modified.csv.tar /data/orcid_activities_2020/last_modified.csv.tar ; rm -f /tmp/last_modified.csv.tar + + the shell command that downloads the lambda file from orcid containing last orcid update informations + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidDownloader + -d${workingPathOrcid}/ + -n${nameNode} + -flast_modified.csv + -odownload/ + -t${token} + + + + + + + \ No newline at end of file