orcid-no-doi #43

Merged
claudio.atzori merged 45 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2020-12-02 10:55:12 +01:00
11 changed files with 628 additions and 45 deletions
Showing only changes of commit c0c2e05eae - Show all commits

View File

@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.mortbay.log.Log; import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.model.WorkData;
@ -143,4 +144,64 @@ public class ActivitiesDecompressor {
Log.info("Error from Orcid found: " + errorFromOrcidFound); Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml work found: " + xmlParserErrorFound); Log.info("Error parsing xml work found: " + xmlParserErrorFound);
} }
public static void extractXML(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
int counter = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
if (entry.isDirectory() || !filename.contains("works")) {
} else {
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
String xml = buffer.toString();
String[] filenameParts = filename.split("/");
final Text key = new Text(
XMLRecordParser
.retrieveOrcidIdFromActivity(
xml.getBytes(), filenameParts[filenameParts.length - 1]));
final Text value = new Text(xml);
writer.append(key, value);
if ((counter % 100000) == 0) {
Log.info("Current xml works extracted: " + counter);
}
}
}
}
}
Log.info("Activities extraction completed");
Log.info("Total XML works parsed: " + counter);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
} }

View File

@ -0,0 +1,54 @@
package eu.dnetlib.doiboost.orcid;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
public class ExtractXMLActivitiesData extends OrcidDSManager {
private String outputWorksPath;
private String activitiesFileNameTarGz;
public static void main(String[] args) throws IOException, Exception {
ExtractXMLActivitiesData extractXMLActivitiesData = new ExtractXMLActivitiesData();
extractXMLActivitiesData.loadArgs(args);
extractXMLActivitiesData.extractWorks();
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenOrcidAuthorWork.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputWorksPath = parser.get("outputWorksPath");
Log.info("Output Author Work Data: " + outputWorksPath);
}
private void extractWorks() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
.concat(workingPath)
.concat(outputWorksPath));
ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath);
}
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.doiboost.orcid;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
public class ExtractXMLSummariesData extends OrcidDSManager {
private String outputAuthorsPath;
private String summariesFileNameTarGz;
public static void main(String[] args) throws IOException, Exception {
ExtractXMLSummariesData extractXMLSummariesData = new ExtractXMLSummariesData();
extractXMLSummariesData.loadArgs(args);
extractXMLSummariesData.extractAuthors();
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenOrcidAuthorWork.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
Log.info("Summaries File Name: " + summariesFileNameTarGz);
outputAuthorsPath = parser.get("outputAuthorsPath");
Log.info("Output Authors Data: " + outputAuthorsPath);
}
public void extractAuthors() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
.concat(workingPath)
.concat(outputAuthorsPath)
.concat("xml_authors.seq"));
SummariesDecompressor.extractXML(conf, tarGzUri, outputPath);
}
}

View File

@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.mortbay.log.Log; import org.mortbay.log.Log;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.AuthorData;
@ -160,4 +161,67 @@ public class SummariesDecompressor {
Log.info("Error from Orcid found: " + errorFromOrcidFound); Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml record found: " + xmlParserErrorFound); Log.info("Error parsing xml record found: " + xmlParserErrorFound);
} }
public static void extractXML(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
int counter = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
CompressionCodec Codec = new GzipCodec();
org.apache.hadoop.io.SequenceFile.Writer.Option optCom = SequenceFile.Writer
.compression(SequenceFile.CompressionType.RECORD, Codec);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class), optCom)) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
if (entry.isDirectory()) {
Log.debug("Directory entry name: " + entry.getName());
} else {
Log.debug("XML record entry name: " + entry.getName());
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
String xml = buffer.toString();
final Text key = new Text(
XMLRecordParser
.retrieveOrcidIdFromSummary(
xml.getBytes(), filename.split("/")[2].substring(0, 19)));
final Text value = new Text(xml);
writer.append(key, value);
}
if ((counter % 100000) == 0) {
Log.info("Current xml records extracted: " + counter);
}
}
}
}
Log.info("Summaries extract completed");
Log.info("Total XML records parsed: " + counter);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
} }

View File

@ -4,6 +4,8 @@ package eu.dnetlib.doiboost.orcid.xml;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import org.mortbay.log.Log;
import com.ximpleware.AutoPilot; import com.ximpleware.AutoPilot;
import com.ximpleware.EOFException; import com.ximpleware.EOFException;
import com.ximpleware.EncodingException; import com.ximpleware.EncodingException;
@ -126,4 +128,33 @@ public class XMLRecordParser {
} }
return workData; return workData;
} }
public static String retrieveOrcidIdFromSummary(byte[] bytes, String defaultValue)
throws VtdException, ParseException {
return retrieveOrcidId(bytes, defaultValue, NS_RECORD, NS_RECORD_URL, "//record:record", "path").substring(1);
}
public static String retrieveOrcidIdFromActivity(byte[] bytes, String defaultValue)
throws VtdException, ParseException {
return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
}
private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
String idAttributeName)
throws VtdException, ParseException {
final VTDGen vg = new VTDGen();
vg.setDoc(bytes);
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(ns, nsUrl);
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, xpath, Arrays.asList(idAttributeName));
if (!recordNodes.isEmpty()) {
return (recordNodes.get(0).getAttributes().get(idAttributeName));
}
Log.info("id not found - default: " + defaultValue);
return defaultValue;
}
} }

View File

@ -1,45 +0,0 @@
<workflow-app name="Orcid Download" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPathOrcid</name>
<description>the working dir base path</description>
</property>
<property>
<name>token</name>
<description>access token</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPathOrcid}/download'/>
<mkdir path='${workingPathOrcid}/download'/>
</fs>
<ok to="DownloadOrcidData"/>
<error to="Kill"/>
</action>
<action name="DownloadOrcidData">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
<arg>-d</arg><arg>${workingPathOrcid}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>last_modified.csv</arg>
<arg>-o</arg><arg>download/</arg>
<arg>-t</arg><arg>${token}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,232 @@
<workflow-app name="Extract Orcid XML Works From Activities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx2g</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
</configuration>
</global>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/xml/works'/>
<mkdir path='${workingPath}/xml/works'/>
</fs>
<ok to="fork_node"/>
<error to="Kill"/>
</action>
<fork name = "fork_node">
<path start = "ExtractXMLWorkActivities_0"/>
<path start = "ExtractXMLWorkActivities_1"/>
<path start = "ExtractXMLWorkActivities_2"/>
<path start = "ExtractXMLWorkActivities_3"/>
<path start = "ExtractXMLWorkActivities_4"/>
<path start = "ExtractXMLWorkActivities_5"/>
<path start = "ExtractXMLWorkActivities_6"/>
<path start = "ExtractXMLWorkActivities_7"/>
<path start = "ExtractXMLWorkActivities_8"/>
<path start = "ExtractXMLWorkActivities_9"/>
<path start = "ExtractXMLWorkActivities_X"/>
</fork>
<action name="ExtractXMLWorkActivities_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_0.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_1.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_2.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_3.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_4.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_5.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_6.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_7.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_8.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_9.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_X.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "join_node" to = "End"/>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,26 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx8g</value>
</property>
</configuration>

View File

@ -0,0 +1,40 @@
<workflow-app name="Extract Orcid XML Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/xml/authors'/>
<mkdir path='${workingPath}/xml/authors'/>
</fs>
<ok to="ExtractXMLAuthorsSummaries"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLAuthorsSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
<arg>-o</arg><arg>xml/authors/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,64 @@
<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>token</name>
<description>access token</description>
</property>
<property>
<name>shell_cmd</name>
<value>wget -O /tmp/last_modified.csv.tar http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar ; hdfs dfs -copyFromLocal /tmp/last_modified.csv.tar /data/orcid_activities_2020/last_modified.csv.tar ; rm -f /tmp/last_modified.csv.tar
</value>
<description>the shell command that downloads the lambda file from orcid containing last orcid update informations</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/downloads'/>
<mkdir path='${workingPath}/downloads'/>
</fs>
<ok to="DownloadLambdaFile"/>
<error to="Kill"/>
</action>
<action name="DownloadLambdaFile">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd}</argument>
<capture-output/>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="DownloadOrcidData">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
<arg>-d</arg><arg>${workingPathOrcid}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>last_modified.csv</arg>
<arg>-o</arg><arg>download/</arg>
<arg>-t</arg><arg>${token}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>