orcid-no-doi #43
|
@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
|
||||||
public void generateAuthorsDOIsData() throws Exception {
|
public void generateAuthorsDOIsData() throws Exception {
|
||||||
Configuration conf = initConfigurationObject();
|
Configuration conf = initConfigurationObject();
|
||||||
FileSystem fs = initFileSystemObject(conf);
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
|
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
|
||||||
Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
|
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
|
||||||
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
|
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
|
||||||
|
|
||||||
hdfsServerUri = parser.get("hdfsServerUri");
|
hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
Log.info("HDFS URI: " + hdfsServerUri);
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
workingPath = parser.get("workingPath");
|
||||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
Log.info("Default Path: " + workingPath);
|
||||||
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||||
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||||
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
|
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
|
||||||
|
|
|
@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
public class OrcidDSManager {
|
public class OrcidDSManager {
|
||||||
|
|
||||||
protected String hdfsServerUri;
|
protected String hdfsServerUri;
|
||||||
protected String hdfsOrcidDefaultPath;
|
protected String workingPath;
|
||||||
private String summariesFileNameTarGz;
|
private String summariesFileNameTarGz;
|
||||||
private String outputAuthorsPath;
|
private String outputAuthorsPath;
|
||||||
|
|
||||||
|
@ -28,10 +28,10 @@ public class OrcidDSManager {
|
||||||
public void generateAuthors() throws Exception {
|
public void generateAuthors() throws Exception {
|
||||||
Configuration conf = initConfigurationObject();
|
Configuration conf = initConfigurationObject();
|
||||||
FileSystem fs = initFileSystemObject(conf);
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
|
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
|
||||||
Path outputPath = new Path(
|
Path outputPath = new Path(
|
||||||
hdfsServerUri
|
hdfsServerUri
|
||||||
.concat(hdfsOrcidDefaultPath)
|
.concat(workingPath)
|
||||||
.concat(outputAuthorsPath)
|
.concat(outputAuthorsPath)
|
||||||
.concat("authors.seq"));
|
.concat("authors.seq"));
|
||||||
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
|
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
|
||||||
|
@ -41,7 +41,7 @@ public class OrcidDSManager {
|
||||||
// ====== Init HDFS File System Object
|
// ====== Init HDFS File System Object
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
// Set FileSystem URI
|
// Set FileSystem URI
|
||||||
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
|
conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
|
||||||
// Because of Maven
|
// Because of Maven
|
||||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||||
|
@ -52,7 +52,7 @@ public class OrcidDSManager {
|
||||||
// Get the filesystem - HDFS
|
// Get the filesystem - HDFS
|
||||||
FileSystem fs = null;
|
FileSystem fs = null;
|
||||||
try {
|
try {
|
||||||
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
|
fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
// TODO Auto-generated catch block
|
// TODO Auto-generated catch block
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
|||||||
|
@ -71,8 +71,8 @@ public class OrcidDSManager {
|
||||||
|
|
||||||
hdfsServerUri = parser.get("hdfsServerUri");
|
hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
Log.info("HDFS URI: " + hdfsServerUri);
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
workingPath = parser.get("workingPath");
|
||||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
Log.info("Working Path: " + workingPath);
|
||||||
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
||||||
Log.info("Summaries File Name: " + summariesFileNameTarGz);
|
Log.info("Summaries File Name: " + summariesFileNameTarGz);
|
||||||
outputAuthorsPath = parser.get("outputAuthorsPath");
|
outputAuthorsPath = parser.get("outputAuthorsPath");
|
||||||
|
|
|
@ -69,12 +69,12 @@ public class OrcidDownloader extends OrcidDSManager {
|
||||||
long startDownload = 0;
|
long startDownload = 0;
|
||||||
Configuration conf = initConfigurationObject();
|
Configuration conf = initConfigurationObject();
|
||||||
FileSystem fs = initFileSystemObject(conf);
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
|
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
|
||||||
Path hdfsreadpath = new Path(lambdaFileUri);
|
Path hdfsreadpath = new Path(lambdaFileUri);
|
||||||
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
||||||
Path hdfsoutputPath = new Path(
|
Path hdfsoutputPath = new Path(
|
||||||
hdfsServerUri
|
hdfsServerUri
|
||||||
.concat(hdfsOrcidDefaultPath)
|
.concat(workingPath)
|
||||||
.concat(outputPath)
|
.concat(outputPath)
|
||||||
.concat("orcid_records.seq"));
|
.concat("orcid_records.seq"));
|
||||||
|
|
||||||
|
@ -176,8 +176,8 @@ public class OrcidDownloader extends OrcidDSManager {
|
||||||
|
|
||||||
hdfsServerUri = parser.get("hdfsServerUri");
|
hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
Log.info("HDFS URI: " + hdfsServerUri);
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
workingPath = parser.get("workingPath");
|
||||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
Log.info("Default Path: " + workingPath);
|
||||||
lambdaFileName = parser.get("lambdaFileName");
|
lambdaFileName = parser.get("lambdaFileName");
|
||||||
Log.info("Lambda File Name: " + lambdaFileName);
|
Log.info("Lambda File Name: " + lambdaFileName);
|
||||||
outputPath = parser.get("outputPath");
|
outputPath = parser.get("outputPath");
|
||||||
|
|
|
@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
|
||||||
public class ActivitiesDumpReader {
|
public class ActivitiesDumpReader {
|
||||||
|
|
||||||
private static final int MAX_XML_WORKS_PARSED = 100;
|
private static final int MAX_XML_WORKS_PARSED = -1;
|
||||||
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10;
|
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
||||||
|
|
||||||
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
@ -127,7 +127,7 @@ public class ActivitiesDumpReader {
|
||||||
Log
|
Log
|
||||||
.warn(
|
.warn(
|
||||||
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
|
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
|
||||||
Log.warn(e);
|
// Log.warn(e);
|
||||||
}
|
}
|
||||||
claudio.atzori
commented
What is the reason for not handling nor let propagate this exception? I imagine that a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and deepen the analysis to spot the error. In this way the error would likely be unnoticed, but causing a drop in the number of output records. What is the reason for not handling nor let propagate this exception? I imagine that a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and deepen the analysis to spot the error. In this way the error would likely be unnoticed, but causing a drop in the number of output records.
|
|||||||
|
|
||||||
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||||
|
|
|
@ -16,7 +16,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
|
||||||
|
|
||||||
private String activitiesFileNameTarGz;
|
private String activitiesFileNameTarGz;
|
||||||
private String outputWorksPath;
|
private String outputWorksPath;
|
||||||
private String workingPath;
|
// private String workingPath;
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException, Exception {
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
||||||
|
@ -45,7 +45,6 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
|
||||||
Log.info("HDFS URI: " + hdfsServerUri);
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
workingPath = parser.get("workingPath");
|
workingPath = parser.get("workingPath");
|
||||||
Log.info("Working Path: " + workingPath);
|
Log.info("Working Path: " + workingPath);
|
||||||
hdfsOrcidDefaultPath = workingPath;
|
|
||||||
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||||
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||||
outputWorksPath = parser.get("outputWorksPath");
|
outputWorksPath = parser.get("outputWorksPath");
|
||||||
|
|
|
@ -16,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -24,9 +25,11 @@ import com.google.gson.JsonElement;
|
||||||
import com.google.gson.JsonParser;
|
import com.google.gson.JsonParser;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -59,7 +62,7 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> summariesRDD = sc
|
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||||
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
|
.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
|
||||||
Dataset<AuthorData> summariesDataset = spark
|
Dataset<AuthorData> summariesDataset = spark
|
||||||
.createDataset(
|
.createDataset(
|
||||||
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||||
|
@ -89,8 +92,19 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.toJavaRDD();
|
.toJavaRDD();
|
||||||
logger.info("Works enriched data created: " + enrichedWorksRDD.count());
|
logger.info("Works enriched data created: " + enrichedWorksRDD.count());
|
||||||
enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
||||||
logger.info("Works enriched data saved");
|
logger.info("Works enriched data saved");
|
||||||
|
JavaRDD<Tuple2<String, Publication>> oafPublicationRDD = enrichedWorksRDD.map(e -> {
|
||||||
|
JsonElement j = new JsonParser().parse(e._2());
|
||||||
|
return new Tuple2<>(e._1(), (Publication) PublicationToOaf
|
||||||
|
.generatePublicationActionsFromDump(j.getAsJsonObject()));
|
||||||
|
});
|
||||||
|
|
||||||
|
Dataset<Tuple2<String, Publication>> publicationDataset = spark
|
||||||
|
.createDataset(
|
||||||
|
oafPublicationRDD.repartition(1).rdd(),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class)));
|
||||||
|
publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -172,7 +172,7 @@ public class PublicationToOaf {
|
||||||
instance.setUrl(urls);
|
instance.setUrl(urls);
|
||||||
}
|
}
|
||||||
|
|
||||||
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
final String pubDate = getPublicationDate(rootElement, "publicationDates");
|
||||||
if (StringUtils.isNotBlank(pubDate)) {
|
if (StringUtils.isNotBlank(pubDate)) {
|
||||||
instance.setDateofacceptance(mapStringField(pubDate, null));
|
instance.setDateofacceptance(mapStringField(pubDate, null));
|
||||||
}
|
}
|
||||||
claudio.atzori
commented
Is the caller expecting the Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
enrico.ottonello
commented
yes, there is a check on null value yes, there is a check on null value
|
|||||||
|
@ -325,7 +325,12 @@ public class PublicationToOaf {
|
||||||
private static String getPublicationDate(final JsonObject rootElement,
|
private static String getPublicationDate(final JsonObject rootElement,
|
||||||
final String jsonKey) {
|
final String jsonKey) {
|
||||||
|
|
||||||
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
JsonObject pubDateJson = null;
|
||||||
|
try {
|
||||||
|
pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
||||||
|
} catch (Exception e) {
|
||||||
claudio.atzori
commented
Is the caller expecting the Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
enrico.ottonello
commented
yes, there is a check on null value yes, there is a check on null value
|
|||||||
|
return null;
|
||||||
|
}
|
||||||
if (pubDateJson == null) {
|
if (pubDateJson == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
claudio.atzori
commented
Is the caller expecting the Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
enrico.ottonello
commented
yes, there is a check on null value yes, there is a check on null value
|
|||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[
|
[
|
||||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
|
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
|
||||||
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -1,6 +1,6 @@
|
||||||
[
|
[
|
||||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||||
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -1,6 +1,6 @@
|
||||||
[
|
[
|
||||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
|
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
|
||||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||||
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}
|
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}
|
||||||
|
|
|
@ -1,75 +1,9 @@
|
||||||
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath_activities</name>
|
<name>workingPath</name>
|
||||||
<description>the working dir base path</description>
|
<description>the working dir base path</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>shell_cmd_0</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_1</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_2</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_3</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_4</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_5</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_6</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_7</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_8</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_9</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>shell_cmd_X</name>
|
|
||||||
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
|
||||||
</value>
|
|
||||||
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
<start to="ResetWorkingPath"/>
|
||||||
|
@ -80,437 +14,12 @@
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
<action name="ResetWorkingPath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path='${workingPath_activities}/no_doi_works/*'/>
|
<delete path='${workingPath}/no_doi_enriched_works/output'/>
|
||||||
<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
|
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="fork_gen_orcid_author_work"/>
|
<ok to="Gen_Enriched_Orcid_Works"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<fork name = "fork_gen_orcid_author_work">
|
|
||||||
<path start = "check_exist_on_hdfs_activities_0"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_1"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_2"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_3"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_4"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_5"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_6"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_7"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_8"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_9"/>
|
|
||||||
<path start = "check_exist_on_hdfs_activities_X"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_0">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_0">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_0" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_0">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_0}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_0"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_0">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_1">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_1">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_1" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_1">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_1}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_1"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_1">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_2">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_2">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_2" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_2">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_2}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_2"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_2">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_3">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_3">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_3" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_3">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_3}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_3"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_3">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_4">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_4">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_4" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_4">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_4}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_4"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_4">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_5">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_5">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_5" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_5">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_5}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_5"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_5">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_6">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_6">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_6" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_6">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_6}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_6"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_6">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_7">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_7">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_7" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_7">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_7}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_7"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_7">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_8">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_8">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_8" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_8">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_8}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_8"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_8">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_9">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_9">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_9" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_9">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_9}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_9"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_9">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<decision name="check_exist_on_hdfs_activities_X">
|
|
||||||
<switch>
|
|
||||||
<case to="GenOrcidAuthorWork_X">
|
|
||||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
|
|
||||||
</case>
|
|
||||||
<default to="Download_X" />
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="Download_X">
|
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<exec>bash</exec>
|
|
||||||
<argument>-c</argument>
|
|
||||||
<argument>${shell_cmd_X}</argument>
|
|
||||||
<capture-output/>
|
|
||||||
</shell>
|
|
||||||
<ok to="GenOrcidAuthorWork_X"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="GenOrcidAuthorWork_X">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
|
||||||
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="join_node"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
|
|
||||||
|
|
||||||
<action name="Gen_Enriched_Orcid_Works">
|
<action name="Gen_Enriched_Orcid_Works">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
|
|
@ -1,9 +1,15 @@
|
||||||
<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>workingPath</name>
|
||||||
<description>the working dir base path</description>
|
<description>the working dir base path</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_0</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
<start to="ResetWorkingPath"/>
|
||||||
|
@ -15,24 +21,44 @@
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
<action name="ResetWorkingPath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path='${workingPath}/output'/>
|
<delete path='${workingPath}/summaries/output'/>
|
||||||
<mkdir path='${workingPath}/output'/>
|
<mkdir path='${workingPath}/summaries/output'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="ImportOrcidSummary"/>
|
<ok to="check_exist_on_hdfs_summaries"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_summaries">
|
||||||
|
<switch>
|
||||||
|
<case to="ImportOrcidSummaries">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="DownloadSummaries" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="DownloadSummaries">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_0}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="ImportOrcidSummaries"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
<action name="ImportOrcidSummary">
|
<action name="ImportOrcidSummaries">
|
||||||
<java>
|
<java>
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||||
<arg>-d</arg><arg>${workingPath}/</arg>
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||||
<arg>-o</arg><arg>output/</arg>
|
<arg>-o</arg><arg>summaries/output/</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.java</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx4g</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,514 @@
|
||||||
|
<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_0</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_1</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_2</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_3</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_4</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_5</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_6</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_7</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_8</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_9</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_X</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/no_doi_works/*'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="fork_gen_orcid_author_work"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<fork name = "fork_gen_orcid_author_work">
|
||||||
|
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_3"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_4"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_5"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_6"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_7"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_8"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_9"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_X"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_0">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_0">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_0" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_0">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_0}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_0"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_0">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_1">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_1">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_1" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_1">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_1}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_1"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_1">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_2">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_2">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_2" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_2">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_2}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_2"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_2">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_3">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_3">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_3" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_3">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_3}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_3"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_3">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_4">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_4">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_4" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_4">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_4}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_4"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_4">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_5">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_5">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_5" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_5">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_5}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_5"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_5">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_6">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_6">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_6" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_6">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_6}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_6"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_6">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_7">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_7">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_7" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_7">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_7}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_7"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_7">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_8">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_8">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_8" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_8">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_8}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_8"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_8">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_9">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_9">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_9" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_9">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_9}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_9"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_9">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_X">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_X">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_X" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_X">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_X}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_X"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_X">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<join name = "join_node" to = "End"/>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,22 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,68 @@
|
||||||
|
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_0</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/summaries/output'/>
|
||||||
|
<mkdir path='${workingPath}/summaries/output'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="check_exist_on_hdfs_summaries"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_summaries">
|
||||||
|
<switch>
|
||||||
|
<case to="ImportOrcidSummaries">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="DownloadSummaries" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="DownloadSummaries">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_0}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="ImportOrcidSummaries"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ImportOrcidSummaries">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||||
|
<arg>-o</arg><arg>summaries/output/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -3,9 +3,8 @@ package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.*;
|
||||||
import java.io.IOException;
|
import java.nio.file.Files;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
@ -20,6 +19,7 @@ import org.apache.http.impl.client.HttpClients;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||||
|
|
||||||
public class OrcidClientTest {
|
public class OrcidClientTest {
|
||||||
final String orcidId = "0000-0001-7291-3210";
|
final String orcidId = "0000-0001-7291-3210";
|
||||||
|
@ -36,7 +36,16 @@ public class OrcidClientTest {
|
||||||
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
|
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
|
||||||
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
|
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
|
||||||
|
|
||||||
public String testDownloadRecord(String orcidId) throws Exception {
|
@Test
|
||||||
|
public void downloadTest() throws Exception {
|
||||||
|
String record = testDownloadRecord("0000-0002-2536-4498");
|
||||||
|
File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
|
||||||
|
OutputStream outStream = new FileOutputStream(f);
|
||||||
|
IOUtils.write(record.getBytes(), outStream);
|
||||||
|
System.out.println("saved to tmp");
|
||||||
|
}
|
||||||
|
|
||||||
|
private String testDownloadRecord(String orcidId) throws Exception {
|
||||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||||
|
@ -100,7 +109,7 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Test
|
// @Test
|
||||||
public void getRecordDatestamp() throws ParseException {
|
private void getRecordDatestamp() throws ParseException {
|
||||||
Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
|
Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
|
||||||
Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
|
Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
|
||||||
Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
||||||
|
@ -108,7 +117,7 @@ public class OrcidClientTest {
|
||||||
assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
|
assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testDate(String value) throws ParseException {
|
private void testDate(String value) throws ParseException {
|
||||||
System.out.println(value.toString());
|
System.out.println(value.toString());
|
||||||
if (value.length() != 19) {
|
if (value.length() != 19) {
|
||||||
value = value.substring(0, 19);
|
value = value.substring(0, 19);
|
||||||
|
@ -118,14 +127,16 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Test
|
// @Test
|
||||||
public void testModifiedDate() throws ParseException {
|
@Ignore
|
||||||
|
private void testModifiedDate() throws ParseException {
|
||||||
testDate(toRetrieveDate);
|
testDate(toRetrieveDate);
|
||||||
testDate(toNotRetrieveDate);
|
testDate(toNotRetrieveDate);
|
||||||
testDate(shortDate);
|
testDate(shortDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Test
|
// @Test
|
||||||
public void testReadBase64CompressedRecord() throws Exception {
|
@Ignore
|
||||||
|
private void testReadBase64CompressedRecord() throws Exception {
|
||||||
final String base64CompressedRecord = IOUtils
|
final String base64CompressedRecord = IOUtils
|
||||||
.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
|
.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
|
||||||
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
|
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
|
||||||
|
|
|
@ -13,14 +13,15 @@ import com.google.gson.JsonParser;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
||||||
|
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||||
|
|
||||||
public class PublicationToOafTest {
|
public class PublicationToOafTest {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
|
private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
// @Ignore
|
@Ignore
|
||||||
public void convertOafPublicationTest() throws Exception {
|
private void convertOafPublicationTest() throws Exception {
|
||||||
String jsonPublication = IOUtils
|
String jsonPublication = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
PublicationToOafTest.class.getResourceAsStream("publication.json"));
|
PublicationToOafTest.class.getResourceAsStream("publication.json"));
|
||||||
|
|
|
@ -42,12 +42,12 @@ public class OrcidNoDoiTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Ignore
|
||||||
private void readPublicationFieldsTest()
|
public void readPublicationFieldsTest()
|
||||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||||
logger.info("running loadPublicationFieldsTest ....");
|
logger.info("running loadPublicationFieldsTest ....");
|
||||||
String xml = IOUtils
|
String xml = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
|
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml"));
|
||||||
|
|
||||||
if (xml == null) {
|
if (xml == null) {
|
||||||
logger.info("Resource not found");
|
logger.info("Resource not found");
|
||||||
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<work:work xmlns:address="http://www.orcid.org/ns/address"
|
||||||
|
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
|
||||||
|
xmlns:employment="http://www.orcid.org/ns/employment"
|
||||||
|
xmlns:education="http://www.orcid.org/ns/education"
|
||||||
|
xmlns:other-name="http://www.orcid.org/ns/other-name"
|
||||||
|
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
|
||||||
|
xmlns:funding="http://www.orcid.org/ns/funding"
|
||||||
|
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
|
||||||
|
xmlns:service="http://www.orcid.org/ns/service"
|
||||||
|
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
|
||||||
|
xmlns:distinction="http://www.orcid.org/ns/distinction"
|
||||||
|
xmlns:internal="http://www.orcid.org/ns/internal"
|
||||||
|
xmlns:membership="http://www.orcid.org/ns/membership"
|
||||||
|
xmlns:person="http://www.orcid.org/ns/person"
|
||||||
|
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
|
||||||
|
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
|
||||||
|
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
|
||||||
|
xmlns:activities="http://www.orcid.org/ns/activities"
|
||||||
|
xmlns:qualification="http://www.orcid.org/ns/qualification"
|
||||||
|
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
|
||||||
|
xmlns:error="http://www.orcid.org/ns/error"
|
||||||
|
xmlns:preferences="http://www.orcid.org/ns/preferences"
|
||||||
|
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
|
||||||
|
xmlns:work="http://www.orcid.org/ns/work"
|
||||||
|
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="63461376"
|
||||||
|
path="/0000-0002-2536-4498/work/63461376" visibility="public">
|
||||||
|
<common:created-date>2019-10-22T03:18:13.755Z</common:created-date>
|
||||||
|
<common:last-modified-date>2020-06-17T11:07:13.703Z</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-client-id>
|
||||||
|
<common:uri>https://orcid.org/client/0000-0001-8607-8906</common:uri>
|
||||||
|
<common:path>0000-0001-8607-8906</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-client-id>
|
||||||
|
<common:source-name>INSPIRE-HEP</common:source-name>
|
||||||
|
</common:source>
|
||||||
|
<work:title>
|
||||||
|
<common:title>Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector</common:title>
|
||||||
|
</work:title>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>other-id</common:external-id-type>
|
||||||
|
<common:external-id-value>1759875</common:external-id-value>
|
||||||
|
<common:external-id-normalized transient="true">1759875</common:external-id-normalized>
|
||||||
|
<common:external-id-url>http://inspirehep.net/record/1759875</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>doi</common:external-id-type>
|
||||||
|
<common:external-id-value>10.1140/epjc/s10052-020-7907-9</common:external-id-value>
|
||||||
|
<common:external-id-normalized transient="true">10.1140/epjc/s10052-020-7907-9</common:external-id-normalized>
|
||||||
|
<common:external-id-url>http://dx.doi.org/10.1140/epjc/s10052-020-7907-9</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>arxiv</common:external-id-type>
|
||||||
|
<common:external-id-value>1910.08819</common:external-id-value>
|
||||||
|
<common:external-id-normalized transient="true">arXiv:1910.08819</common:external-id-normalized>
|
||||||
|
<common:external-id-url>http://arxiv.org/abs/1910.08819</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<common:url>http://inspirehep.net/record/1759875</common:url>
|
||||||
|
<work:type>journal-article</work:type>
|
||||||
|
<common:publication-date>
|
||||||
|
<common:year>2020</common:year>
|
||||||
|
<common:month>06</common:month>
|
||||||
|
<common:day>12</common:day>
|
||||||
|
</common:publication-date>
|
||||||
|
<work:journal-title>Eur.Phys.J.C</work:journal-title>
|
||||||
|
</work:work>
|
Loading…
Reference in New Issue
Let the exception propagate and break the job