separate workflow to parse orcid summaries, activities and generate dataset with no doi publications; test

This commit is contained in:
Enrico Ottonello 2020-07-03 23:30:31 +02:00
parent 1729cc5cf3
commit ca37d3427b
20 changed files with 815 additions and 543 deletions

View File

@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
public void generateAuthorsDOIsData() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
}
@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
workingPath = parser.get("workingPath");
Log.info("Default Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");

View File

@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDSManager {
protected String hdfsServerUri;
protected String hdfsOrcidDefaultPath;
protected String workingPath;
private String summariesFileNameTarGz;
private String outputAuthorsPath;
@ -28,10 +28,10 @@ public class OrcidDSManager {
public void generateAuthors() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
.concat(hdfsOrcidDefaultPath)
.concat(workingPath)
.concat(outputAuthorsPath)
.concat("authors.seq"));
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
@ -41,7 +41,7 @@ public class OrcidDSManager {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
@ -52,7 +52,7 @@ public class OrcidDSManager {
// Get the filesystem - HDFS
FileSystem fs = null;
try {
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
@ -71,8 +71,8 @@ public class OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
Log.info("Summaries File Name: " + summariesFileNameTarGz);
outputAuthorsPath = parser.get("outputAuthorsPath");

View File

@ -69,12 +69,12 @@ public class OrcidDownloader extends OrcidDSManager {
long startDownload = 0;
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
Path hdfsreadpath = new Path(lambdaFileUri);
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
Path hdfsoutputPath = new Path(
hdfsServerUri
.concat(hdfsOrcidDefaultPath)
.concat(workingPath)
.concat(outputPath)
.concat("orcid_records.seq"));
@ -176,8 +176,8 @@ public class OrcidDownloader extends OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
workingPath = parser.get("workingPath");
Log.info("Default Path: " + workingPath);
lambdaFileName = parser.get("lambdaFileName");
Log.info("Lambda File Name: " + lambdaFileName);
outputPath = parser.get("outputPath");

View File

@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
public class ActivitiesDumpReader {
private static final int MAX_XML_WORKS_PARSED = 100;
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10;
private static final int MAX_XML_WORKS_PARSED = -1;
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
throws Exception {
@ -127,7 +127,7 @@ public class ActivitiesDumpReader {
Log
.warn(
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
Log.warn(e);
// Log.warn(e);
}
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {

View File

@ -16,7 +16,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
private String activitiesFileNameTarGz;
private String outputWorksPath;
private String workingPath;
// private String workingPath;
public static void main(String[] args) throws IOException, Exception {
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
@ -45,7 +45,6 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
hdfsOrcidDefaultPath = workingPath;
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputWorksPath = parser.get("outputWorksPath");

View File

@ -16,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -24,9 +25,11 @@ import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import scala.Tuple2;
@ -59,7 +62,7 @@ public class SparkGenEnrichedOrcidWorks {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark
.createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
@ -89,8 +92,19 @@ public class SparkGenEnrichedOrcidWorks {
.filter(Objects::nonNull)
.toJavaRDD();
logger.info("Works enriched data created: " + enrichedWorksRDD.count());
enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath);
enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
logger.info("Works enriched data saved");
JavaRDD<Tuple2<String, Publication>> oafPublicationRDD = enrichedWorksRDD.map(e -> {
JsonElement j = new JsonParser().parse(e._2());
return new Tuple2<>(e._1(), (Publication) PublicationToOaf
.generatePublicationActionsFromDump(j.getAsJsonObject()));
});
Dataset<Tuple2<String, Publication>> publicationDataset = spark
.createDataset(
oafPublicationRDD.repartition(1).rdd(),
Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class)));
publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
});
}

View File

@ -172,7 +172,7 @@ public class PublicationToOaf {
instance.setUrl(urls);
}
final String pubDate = getPublicationDate(rootElement, "publication_date");
final String pubDate = getPublicationDate(rootElement, "publicationDates");
if (StringUtils.isNotBlank(pubDate)) {
instance.setDateofacceptance(mapStringField(pubDate, null));
}
@ -325,7 +325,12 @@ public class PublicationToOaf {
private static String getPublicationDate(final JsonObject rootElement,
final String jsonKey) {
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
JsonObject pubDateJson = null;
try {
pubDateJson = rootElement.getAsJsonObject(jsonKey);
} catch (Exception e) {
return null;
}
if (pubDateJson == null) {
return null;
}

View File

@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
]

View File

@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
]

View File

@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}

View File

@ -1,75 +1,9 @@
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath_activities</name>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property>
<property>
<name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property>
<property>
<name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property>
<property>
<name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property>
<property>
<name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property>
<property>
<name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property>
<property>
<name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property>
<property>
<name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property>
<property>
<name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property>
<property>
<name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property>
<property>
<name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
@ -80,436 +14,11 @@
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath_activities}/no_doi_works/*'/>
<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
<delete path='${workingPath}/no_doi_enriched_works/output'/>
</fs>
<ok to="fork_gen_orcid_author_work"/>
<ok to="Gen_Enriched_Orcid_Works"/>
<error to="Kill"/>
</action>
<fork name = "fork_gen_orcid_author_work">
<path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/>
<path start = "check_exist_on_hdfs_activities_3"/>
<path start = "check_exist_on_hdfs_activities_4"/>
<path start = "check_exist_on_hdfs_activities_5"/>
<path start = "check_exist_on_hdfs_activities_6"/>
<path start = "check_exist_on_hdfs_activities_7"/>
<path start = "check_exist_on_hdfs_activities_8"/>
<path start = "check_exist_on_hdfs_activities_9"/>
<path start = "check_exist_on_hdfs_activities_X"/>
</fork>
<decision name="check_exist_on_hdfs_activities_0">
<switch>
<case to="GenOrcidAuthorWork_0">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
</case>
<default to="Download_0" />
</switch>
</decision>
<action name="Download_0">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_0"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_1">
<switch>
<case to="GenOrcidAuthorWork_1">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
</case>
<default to="Download_1" />
</switch>
</decision>
<action name="Download_1">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_1}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_1"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_2">
<switch>
<case to="GenOrcidAuthorWork_2">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
</case>
<default to="Download_2" />
</switch>
</decision>
<action name="Download_2">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_2}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_2"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_3">
<switch>
<case to="GenOrcidAuthorWork_3">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
</case>
<default to="Download_3" />
</switch>
</decision>
<action name="Download_3">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_3}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_3"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_4">
<switch>
<case to="GenOrcidAuthorWork_4">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
</case>
<default to="Download_4" />
</switch>
</decision>
<action name="Download_4">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_4}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_4"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_5">
<switch>
<case to="GenOrcidAuthorWork_5">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
</case>
<default to="Download_5" />
</switch>
</decision>
<action name="Download_5">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_5}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_5"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_6">
<switch>
<case to="GenOrcidAuthorWork_6">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
</case>
<default to="Download_6" />
</switch>
</decision>
<action name="Download_6">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_6}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_6"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_7">
<switch>
<case to="GenOrcidAuthorWork_7">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
</case>
<default to="Download_7" />
</switch>
</decision>
<action name="Download_7">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_7}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_7"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_8">
<switch>
<case to="GenOrcidAuthorWork_8">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
</case>
<default to="Download_8" />
</switch>
</decision>
<action name="Download_8">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_8}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_8"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_9">
<switch>
<case to="GenOrcidAuthorWork_9">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
</case>
<default to="Download_9" />
</switch>
</decision>
<action name="Download_9">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_9}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_9"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_X">
<switch>
<case to="GenOrcidAuthorWork_X">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
</case>
<default to="Download_X" />
</switch>
</decision>
<action name="Download_X">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_X}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_X"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
<action name="Gen_Enriched_Orcid_Works">
<spark xmlns="uri:oozie:spark-action:0.2">

View File

@ -1,9 +1,15 @@
<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
@ -15,24 +21,44 @@
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/output'/>
<mkdir path='${workingPath}/output'/>
<delete path='${workingPath}/summaries/output'/>
<mkdir path='${workingPath}/summaries/output'/>
</fs>
<ok to="ImportOrcidSummary"/>
<ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_summaries">
<switch>
<case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
</case>
<default to="DownloadSummaries" />
</switch>
</decision>
<action name="DownloadSummaries">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="ImportOrcidSummaries"/>
<error to="Kill"/>
</action>
<action name="ImportOrcidSummary">
<action name="ImportOrcidSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-d</arg><arg>${workingPath}/</arg>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
<arg>-o</arg><arg>output/</arg>
<arg>-o</arg><arg>summaries/output/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>

View File

@ -0,0 +1,31 @@
<configuration>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value>
</property>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,514 @@
<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property>
<property>
<name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property>
<property>
<name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property>
<property>
<name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property>
<property>
<name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property>
<property>
<name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property>
<property>
<name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property>
<property>
<name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property>
<property>
<name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property>
<property>
<name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property>
<property>
<name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/no_doi_works/*'/>
</fs>
<ok to="fork_gen_orcid_author_work"/>
<error to="Kill"/>
</action>
<fork name = "fork_gen_orcid_author_work">
<path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/>
<path start = "check_exist_on_hdfs_activities_3"/>
<path start = "check_exist_on_hdfs_activities_4"/>
<path start = "check_exist_on_hdfs_activities_5"/>
<path start = "check_exist_on_hdfs_activities_6"/>
<path start = "check_exist_on_hdfs_activities_7"/>
<path start = "check_exist_on_hdfs_activities_8"/>
<path start = "check_exist_on_hdfs_activities_9"/>
<path start = "check_exist_on_hdfs_activities_X"/>
</fork>
<decision name="check_exist_on_hdfs_activities_0">
<switch>
<case to="GenOrcidAuthorWork_0">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
</case>
<default to="Download_0" />
</switch>
</decision>
<action name="Download_0">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_0"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_1">
<switch>
<case to="GenOrcidAuthorWork_1">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
</case>
<default to="Download_1" />
</switch>
</decision>
<action name="Download_1">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_1}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_1"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_2">
<switch>
<case to="GenOrcidAuthorWork_2">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
</case>
<default to="Download_2" />
</switch>
</decision>
<action name="Download_2">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_2}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_2"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_3">
<switch>
<case to="GenOrcidAuthorWork_3">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
</case>
<default to="Download_3" />
</switch>
</decision>
<action name="Download_3">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_3}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_3"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_4">
<switch>
<case to="GenOrcidAuthorWork_4">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
</case>
<default to="Download_4" />
</switch>
</decision>
<action name="Download_4">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_4}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_4"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_5">
<switch>
<case to="GenOrcidAuthorWork_5">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
</case>
<default to="Download_5" />
</switch>
</decision>
<action name="Download_5">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_5}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_5"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_6">
<switch>
<case to="GenOrcidAuthorWork_6">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
</case>
<default to="Download_6" />
</switch>
</decision>
<action name="Download_6">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_6}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_6"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_7">
<switch>
<case to="GenOrcidAuthorWork_7">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
</case>
<default to="Download_7" />
</switch>
</decision>
<action name="Download_7">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_7}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_7"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_8">
<switch>
<case to="GenOrcidAuthorWork_8">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
</case>
<default to="Download_8" />
</switch>
</decision>
<action name="Download_8">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_8}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_8"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_9">
<switch>
<case to="GenOrcidAuthorWork_9">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
</case>
<default to="Download_9" />
</switch>
</decision>
<action name="Download_9">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_9}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_9"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_X">
<switch>
<case to="GenOrcidAuthorWork_X">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
</case>
<default to="Download_X" />
</switch>
</decision>
<action name="Download_X">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_X}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_X"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "join_node" to = "End"/>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,22 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,68 @@
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/summaries/output'/>
<mkdir path='${workingPath}/summaries/output'/>
</fs>
<ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_summaries">
<switch>
<case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
</case>
<default to="DownloadSummaries" />
</switch>
</decision>
<action name="DownloadSummaries">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="ImportOrcidSummaries"/>
<error to="Kill"/>
</action>
<action name="ImportOrcidSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
<arg>-o</arg><arg>summaries/output/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -3,9 +3,8 @@ package eu.dnetlib.doiboost.orcid;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.*;
import java.nio.file.Files;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
@ -20,6 +19,7 @@ import org.apache.http.impl.client.HttpClients;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class OrcidClientTest {
final String orcidId = "0000-0001-7291-3210";
@ -32,11 +32,20 @@ public class OrcidClientTest {
String lastUpdate = "2019-09-30 00:00:00";
String shortDate = "2020-05-06 16:06:11";
// curl -i -H "Accept: application/vnd.orcid+xml"
// curl -i -H "Accept: application/vnd.orcid+xml"
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
public String testDownloadRecord(String orcidId) throws Exception {
@Test
public void downloadTest() throws Exception {
String record = testDownloadRecord("0000-0002-2536-4498");
File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
OutputStream outStream = new FileOutputStream(f);
IOUtils.write(record.getBytes(), outStream);
System.out.println("saved to tmp");
}
private String testDownloadRecord(String orcidId) throws Exception {
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
@ -100,7 +109,7 @@ public class OrcidClientTest {
}
// @Test
public void getRecordDatestamp() throws ParseException {
private void getRecordDatestamp() throws ParseException {
Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
@ -108,7 +117,7 @@ public class OrcidClientTest {
assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
}
public void testDate(String value) throws ParseException {
private void testDate(String value) throws ParseException {
System.out.println(value.toString());
if (value.length() != 19) {
value = value.substring(0, 19);
@ -118,14 +127,16 @@ public class OrcidClientTest {
}
// @Test
public void testModifiedDate() throws ParseException {
@Ignore
private void testModifiedDate() throws ParseException {
testDate(toRetrieveDate);
testDate(toNotRetrieveDate);
testDate(shortDate);
}
// @Test
public void testReadBase64CompressedRecord() throws Exception {
@Ignore
private void testReadBase64CompressedRecord() throws Exception {
final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);

View File

@ -13,14 +13,15 @@ import com.google.gson.JsonParser;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class PublicationToOafTest {
private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
@Test
// @Ignore
public void convertOafPublicationTest() throws Exception {
@Ignore
private void convertOafPublicationTest() throws Exception {
String jsonPublication = IOUtils
.toString(
PublicationToOafTest.class.getResourceAsStream("publication.json"));

View File

@ -42,12 +42,12 @@ public class OrcidNoDoiTest {
@Test
@Ignore
private void readPublicationFieldsTest()
public void readPublicationFieldsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml"));
if (xml == null) {
logger.info("Resource not found");

View File

@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
xmlns:employment="http://www.orcid.org/ns/employment"
xmlns:education="http://www.orcid.org/ns/education"
xmlns:other-name="http://www.orcid.org/ns/other-name"
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
xmlns:funding="http://www.orcid.org/ns/funding"
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
xmlns:service="http://www.orcid.org/ns/service"
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
xmlns:distinction="http://www.orcid.org/ns/distinction"
xmlns:internal="http://www.orcid.org/ns/internal"
xmlns:membership="http://www.orcid.org/ns/membership"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:activities="http://www.orcid.org/ns/activities"
xmlns:qualification="http://www.orcid.org/ns/qualification"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:error="http://www.orcid.org/ns/error"
xmlns:preferences="http://www.orcid.org/ns/preferences"
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
xmlns:work="http://www.orcid.org/ns/work"
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="63461376"
path="/0000-0002-2536-4498/work/63461376" visibility="public">
<common:created-date>2019-10-22T03:18:13.755Z</common:created-date>
<common:last-modified-date>2020-06-17T11:07:13.703Z</common:last-modified-date>
<common:source>
<common:source-client-id>
<common:uri>https://orcid.org/client/0000-0001-8607-8906</common:uri>
<common:path>0000-0001-8607-8906</common:path>
<common:host>orcid.org</common:host>
</common:source-client-id>
<common:source-name>INSPIRE-HEP</common:source-name>
</common:source>
<work:title>
<common:title>Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector</common:title>
</work:title>
<common:external-ids>
<common:external-id>
<common:external-id-type>other-id</common:external-id-type>
<common:external-id-value>1759875</common:external-id-value>
<common:external-id-normalized transient="true">1759875</common:external-id-normalized>
<common:external-id-url>http://inspirehep.net/record/1759875</common:external-id-url>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>doi</common:external-id-type>
<common:external-id-value>10.1140/epjc/s10052-020-7907-9</common:external-id-value>
<common:external-id-normalized transient="true">10.1140/epjc/s10052-020-7907-9</common:external-id-normalized>
<common:external-id-url>http://dx.doi.org/10.1140/epjc/s10052-020-7907-9</common:external-id-url>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>arxiv</common:external-id-type>
<common:external-id-value>1910.08819</common:external-id-value>
<common:external-id-normalized transient="true">arXiv:1910.08819</common:external-id-normalized>
<common:external-id-url>http://arxiv.org/abs/1910.08819</common:external-id-url>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
</common:external-ids>
<common:url>http://inspirehep.net/record/1759875</common:url>
<work:type>journal-article</work:type>
<common:publication-date>
<common:year>2020</common:year>
<common:month>06</common:month>
<common:day>12</common:day>
</common:publication-date>
<work:journal-title>Eur.Phys.J.C</work:journal-title>
</work:work>