forked from D-Net/dnet-hadoop
added wf to extracting authors and works xml data from orcid dump to hdfs; added wf to download the lamda file (containing last orcid update informations) from orcid to hdfs
This commit is contained in:
parent
c796adae24
commit
c0c2e05eae
|
@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile;
|
|||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||
|
@ -143,4 +144,64 @@ public class ActivitiesDecompressor {
|
|||
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
||||
}
|
||||
|
||||
public static void extractXML(Configuration conf, String inputUri, Path outputPath)
|
||||
throws Exception {
|
||||
String uri = inputUri;
|
||||
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||
Path inputPath = new Path(uri);
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||
CompressionCodec codec = factory.getCodec(inputPath);
|
||||
if (codec == null) {
|
||||
System.err.println("No codec found for " + uri);
|
||||
System.exit(1);
|
||||
}
|
||||
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||
InputStream gzipInputStream = null;
|
||||
try {
|
||||
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||
int counter = 0;
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||
TarArchiveEntry entry = null;
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(outputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
if (entry.isDirectory() || !filename.contains("works")) {
|
||||
} else {
|
||||
counter++;
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
String xml = buffer.toString();
|
||||
String[] filenameParts = filename.split("/");
|
||||
final Text key = new Text(
|
||||
XMLRecordParser
|
||||
.retrieveOrcidIdFromActivity(
|
||||
xml.getBytes(), filenameParts[filenameParts.length - 1]));
|
||||
final Text value = new Text(xml);
|
||||
writer.append(key, value);
|
||||
if ((counter % 100000) == 0) {
|
||||
Log.info("Current xml works extracted: " + counter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.info("Activities extraction completed");
|
||||
Log.info("Total XML works parsed: " + counter);
|
||||
} finally {
|
||||
Log.debug("Closing gzip stream");
|
||||
IOUtils.closeStream(gzipInputStream);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
|
||||
|
||||
public class ExtractXMLActivitiesData extends OrcidDSManager {
|
||||
private String outputWorksPath;
|
||||
private String activitiesFileNameTarGz;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
ExtractXMLActivitiesData extractXMLActivitiesData = new ExtractXMLActivitiesData();
|
||||
extractXMLActivitiesData.loadArgs(args);
|
||||
extractXMLActivitiesData.extractWorks();
|
||||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenOrcidAuthorWork.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Working Path: " + workingPath);
|
||||
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||
outputWorksPath = parser.get("outputWorksPath");
|
||||
Log.info("Output Author Work Data: " + outputWorksPath);
|
||||
}
|
||||
|
||||
private void extractWorks() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
|
||||
Path outputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(workingPath)
|
||||
.concat(outputWorksPath));
|
||||
ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
|
||||
|
||||
public class ExtractXMLSummariesData extends OrcidDSManager {
|
||||
|
||||
private String outputAuthorsPath;
|
||||
private String summariesFileNameTarGz;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
ExtractXMLSummariesData extractXMLSummariesData = new ExtractXMLSummariesData();
|
||||
extractXMLSummariesData.loadArgs(args);
|
||||
extractXMLSummariesData.extractAuthors();
|
||||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenOrcidAuthorWork.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Working Path: " + workingPath);
|
||||
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
||||
Log.info("Summaries File Name: " + summariesFileNameTarGz);
|
||||
outputAuthorsPath = parser.get("outputAuthorsPath");
|
||||
Log.info("Output Authors Data: " + outputAuthorsPath);
|
||||
}
|
||||
|
||||
public void extractAuthors() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
|
||||
Path outputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(workingPath)
|
||||
.concat(outputAuthorsPath)
|
||||
.concat("xml_authors.seq"));
|
||||
SummariesDecompressor.extractXML(conf, tarGzUri, outputPath);
|
||||
}
|
||||
}
|
|
@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile;
|
|||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||
|
@ -160,4 +161,67 @@ public class SummariesDecompressor {
|
|||
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||
Log.info("Error parsing xml record found: " + xmlParserErrorFound);
|
||||
}
|
||||
|
||||
public static void extractXML(Configuration conf, String inputUri, Path outputPath)
|
||||
throws Exception {
|
||||
String uri = inputUri;
|
||||
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||
Path inputPath = new Path(uri);
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||
CompressionCodec codec = factory.getCodec(inputPath);
|
||||
if (codec == null) {
|
||||
System.err.println("No codec found for " + uri);
|
||||
System.exit(1);
|
||||
}
|
||||
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||
InputStream gzipInputStream = null;
|
||||
try {
|
||||
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||
int counter = 0;
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||
TarArchiveEntry entry = null;
|
||||
CompressionCodec Codec = new GzipCodec();
|
||||
org.apache.hadoop.io.SequenceFile.Writer.Option optCom = SequenceFile.Writer
|
||||
.compression(SequenceFile.CompressionType.RECORD, Codec);
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(outputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class), optCom)) {
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
if (entry.isDirectory()) {
|
||||
Log.debug("Directory entry name: " + entry.getName());
|
||||
} else {
|
||||
Log.debug("XML record entry name: " + entry.getName());
|
||||
counter++;
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
String xml = buffer.toString();
|
||||
final Text key = new Text(
|
||||
XMLRecordParser
|
||||
.retrieveOrcidIdFromSummary(
|
||||
xml.getBytes(), filename.split("/")[2].substring(0, 19)));
|
||||
final Text value = new Text(xml);
|
||||
writer.append(key, value);
|
||||
}
|
||||
if ((counter % 100000) == 0) {
|
||||
Log.info("Current xml records extracted: " + counter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.info("Summaries extract completed");
|
||||
Log.info("Total XML records parsed: " + counter);
|
||||
|
||||
} finally {
|
||||
Log.debug("Closing gzip stream");
|
||||
IOUtils.closeStream(gzipInputStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,8 @@ package eu.dnetlib.doiboost.orcid.xml;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.EOFException;
|
||||
import com.ximpleware.EncodingException;
|
||||
|
@ -126,4 +128,33 @@ public class XMLRecordParser {
|
|||
}
|
||||
return workData;
|
||||
}
|
||||
|
||||
public static String retrieveOrcidIdFromSummary(byte[] bytes, String defaultValue)
|
||||
throws VtdException, ParseException {
|
||||
return retrieveOrcidId(bytes, defaultValue, NS_RECORD, NS_RECORD_URL, "//record:record", "path").substring(1);
|
||||
}
|
||||
|
||||
public static String retrieveOrcidIdFromActivity(byte[] bytes, String defaultValue)
|
||||
throws VtdException, ParseException {
|
||||
return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
|
||||
}
|
||||
|
||||
private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
|
||||
String idAttributeName)
|
||||
throws VtdException, ParseException {
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(bytes);
|
||||
vg.parse(true);
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
ap.declareXPathNameSpace(ns, nsUrl);
|
||||
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap, vn, xpath, Arrays.asList(idAttributeName));
|
||||
if (!recordNodes.isEmpty()) {
|
||||
return (recordNodes.get(0).getAttributes().get(idAttributeName));
|
||||
}
|
||||
Log.info("id not found - default: " + defaultValue);
|
||||
return defaultValue;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
<workflow-app name="Orcid Download" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPathOrcid</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>token</name>
|
||||
<description>access token</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPathOrcid}/download'/>
|
||||
<mkdir path='${workingPathOrcid}/download'/>
|
||||
</fs>
|
||||
<ok to="DownloadOrcidData"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="DownloadOrcidData">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
|
||||
<arg>-d</arg><arg>${workingPathOrcid}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>last_modified.csv</arg>
|
||||
<arg>-o</arg><arg>download/</arg>
|
||||
<arg>-t</arg><arg>${token}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,232 @@
|
|||
<workflow-app name="Extract Orcid XML Works From Activities" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.java</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx2g</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/xml/works'/>
|
||||
<mkdir path='${workingPath}/xml/works'/>
|
||||
</fs>
|
||||
<ok to="fork_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name = "fork_node">
|
||||
<path start = "ExtractXMLWorkActivities_0"/>
|
||||
<path start = "ExtractXMLWorkActivities_1"/>
|
||||
<path start = "ExtractXMLWorkActivities_2"/>
|
||||
<path start = "ExtractXMLWorkActivities_3"/>
|
||||
<path start = "ExtractXMLWorkActivities_4"/>
|
||||
<path start = "ExtractXMLWorkActivities_5"/>
|
||||
<path start = "ExtractXMLWorkActivities_6"/>
|
||||
<path start = "ExtractXMLWorkActivities_7"/>
|
||||
<path start = "ExtractXMLWorkActivities_8"/>
|
||||
<path start = "ExtractXMLWorkActivities_9"/>
|
||||
<path start = "ExtractXMLWorkActivities_X"/>
|
||||
</fork>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_0">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_0.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_1">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_1.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_2">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_2.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_3">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_3.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_4">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_4.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_5">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_5.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractXMLWorkActivities_6">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_6.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_7">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_7.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractXMLWorkActivities_8">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_8.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_9">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_9.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLWorkActivities_X">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>xml/works/xml_works_X.seq</arg>
|
||||
<arg>-oew</arg><arg>---</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name = "join_node" to = "End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,26 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx8g</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,40 @@
|
|||
<workflow-app name="Extract Orcid XML Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/xml/authors'/>
|
||||
<mkdir path='${workingPath}/xml/authors'/>
|
||||
</fs>
|
||||
<ok to="ExtractXMLAuthorsSummaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractXMLAuthorsSummaries">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>xml/authors/</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,64 @@
|
|||
<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>token</name>
|
||||
<description>access token</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd</name>
|
||||
<value>wget -O /tmp/last_modified.csv.tar http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar ; hdfs dfs -copyFromLocal /tmp/last_modified.csv.tar /data/orcid_activities_2020/last_modified.csv.tar ; rm -f /tmp/last_modified.csv.tar
|
||||
</value>
|
||||
<description>the shell command that downloads the lambda file from orcid containing last orcid update informations</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/downloads'/>
|
||||
<mkdir path='${workingPath}/downloads'/>
|
||||
</fs>
|
||||
<ok to="DownloadLambdaFile"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="DownloadLambdaFile">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="DownloadOrcidData">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
|
||||
<arg>-d</arg><arg>${workingPathOrcid}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>last_modified.csv</arg>
|
||||
<arg>-o</arg><arg>download/</arg>
|
||||
<arg>-t</arg><arg>${token}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
Loading…
Reference in New Issue