orcid-no-doi #43
|
@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
|
|||
public void generateAuthorsDOIsData() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
|
||||
Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
|
||||
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
|
||||
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
|
||||
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
|
||||
}
|
||||
|
||||
|
@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
|
|||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Default Path: " + workingPath);
|
||||
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
|
||||
|
|
|
@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
public class OrcidDSManager {
|
||||
|
||||
protected String hdfsServerUri;
|
||||
protected String hdfsOrcidDefaultPath;
|
||||
protected String workingPath;
|
||||
private String summariesFileNameTarGz;
|
||||
private String outputAuthorsPath;
|
||||
|
||||
|
@ -28,10 +28,10 @@ public class OrcidDSManager {
|
|||
public void generateAuthors() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
|
||||
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
|
||||
Path outputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(hdfsOrcidDefaultPath)
|
||||
.concat(workingPath)
|
||||
.concat(outputAuthorsPath)
|
||||
.concat("authors.seq"));
|
||||
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
|
||||
|
@ -41,7 +41,7 @@ public class OrcidDSManager {
|
|||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
|
||||
conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
@ -52,7 +52,7 @@ public class OrcidDSManager {
|
|||
// Get the filesystem - HDFS
|
||||
FileSystem fs = null;
|
||||
try {
|
||||
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
|
||||
fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
|
||||
} catch (IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
|
||||
|
@ -71,8 +71,8 @@ public class OrcidDSManager {
|
|||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Working Path: " + workingPath);
|
||||
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
||||
Log.info("Summaries File Name: " + summariesFileNameTarGz);
|
||||
outputAuthorsPath = parser.get("outputAuthorsPath");
|
||||
|
|
|
@ -69,12 +69,12 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
long startDownload = 0;
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
|
||||
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
|
||||
Path hdfsreadpath = new Path(lambdaFileUri);
|
||||
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
||||
Path hdfsoutputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(hdfsOrcidDefaultPath)
|
||||
.concat(workingPath)
|
||||
.concat(outputPath)
|
||||
.concat("orcid_records.seq"));
|
||||
|
||||
|
@ -176,8 +176,8 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Default Path: " + workingPath);
|
||||
lambdaFileName = parser.get("lambdaFileName");
|
||||
Log.info("Lambda File Name: " + lambdaFileName);
|
||||
outputPath = parser.get("outputPath");
|
||||
|
|
|
@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
|||
|
||||
public class ActivitiesDumpReader {
|
||||
|
||||
private static final int MAX_XML_WORKS_PARSED = 100;
|
||||
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10;
|
||||
private static final int MAX_XML_WORKS_PARSED = -1;
|
||||
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
||||
|
||||
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
||||
throws Exception {
|
||||
|
@ -127,7 +127,7 @@ public class ActivitiesDumpReader {
|
|||
Log
|
||||
.warn(
|
||||
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
|
||||
Log.warn(e);
|
||||
// Log.warn(e);
|
||||
}
|
||||
claudio.atzori
commented
What is the reason for not handling nor let propagate this exception? I imagine that a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and deepen the analysis to spot the error. In this way the error would likely be unnoticed, but causing a drop in the number of output records. What is the reason for not handling nor let propagate this exception? I imagine that a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and deepen the analysis to spot the error. In this way the error would likely be unnoticed, but causing a drop in the number of output records.
|
||||
|
||||
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||
|
|
|
@ -16,7 +16,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
|
|||
|
||||
private String activitiesFileNameTarGz;
|
||||
private String outputWorksPath;
|
||||
private String workingPath;
|
||||
// private String workingPath;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
||||
|
@ -45,7 +45,6 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
|
|||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Working Path: " + workingPath);
|
||||
hdfsOrcidDefaultPath = workingPath;
|
||||
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||
outputWorksPath = parser.get("outputWorksPath");
|
||||
|
|
|
@ -16,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -24,9 +25,11 @@ import com.google.gson.JsonElement;
|
|||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
@ -59,7 +62,7 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
|
||||
.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
|
||||
Dataset<AuthorData> summariesDataset = spark
|
||||
.createDataset(
|
||||
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||
|
@ -89,8 +92,19 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
.filter(Objects::nonNull)
|
||||
.toJavaRDD();
|
||||
logger.info("Works enriched data created: " + enrichedWorksRDD.count());
|
||||
enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
||||
enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
||||
logger.info("Works enriched data saved");
|
||||
JavaRDD<Tuple2<String, Publication>> oafPublicationRDD = enrichedWorksRDD.map(e -> {
|
||||
JsonElement j = new JsonParser().parse(e._2());
|
||||
return new Tuple2<>(e._1(), (Publication) PublicationToOaf
|
||||
.generatePublicationActionsFromDump(j.getAsJsonObject()));
|
||||
});
|
||||
|
||||
Dataset<Tuple2<String, Publication>> publicationDataset = spark
|
||||
.createDataset(
|
||||
oafPublicationRDD.repartition(1).rdd(),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class)));
|
||||
publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -172,7 +172,7 @@ public class PublicationToOaf {
|
|||
instance.setUrl(urls);
|
||||
}
|
||||
|
||||
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
||||
final String pubDate = getPublicationDate(rootElement, "publicationDates");
|
||||
if (StringUtils.isNotBlank(pubDate)) {
|
||||
instance.setDateofacceptance(mapStringField(pubDate, null));
|
||||
}
|
||||
claudio.atzori
commented
Is the caller expecting the Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
enrico.ottonello
commented
yes, there is a check on null value yes, there is a check on null value
|
||||
|
@ -325,7 +325,12 @@ public class PublicationToOaf {
|
|||
private static String getPublicationDate(final JsonObject rootElement,
|
||||
final String jsonKey) {
|
||||
|
||||
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
||||
JsonObject pubDateJson = null;
|
||||
try {
|
||||
pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
||||
} catch (Exception e) {
|
||||
claudio.atzori
commented
Is the caller expecting the Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
enrico.ottonello
commented
yes, there is a check on null value yes, there is a check on null value
|
||||
return null;
|
||||
}
|
||||
if (pubDateJson == null) {
|
||||
return null;
|
||||
}
|
||||
claudio.atzori
commented
Is the caller expecting the Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
enrico.ottonello
commented
yes, there is a check on null value yes, there is a check on null value
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
||||
]
|
|
@ -1,6 +1,6 @@
|
|||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
||||
]
|
|
@ -1,6 +1,6 @@
|
|||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}
|
||||
|
|
|
@ -1,75 +1,9 @@
|
|||
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath_activities</name>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_1</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_2</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_3</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_4</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_5</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_6</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_7</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_8</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_9</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_X</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
@ -80,436 +14,11 @@
|
|||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath_activities}/no_doi_works/*'/>
|
||||
<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
|
||||
<delete path='${workingPath}/no_doi_enriched_works/output'/>
|
||||
</fs>
|
||||
<ok to="fork_gen_orcid_author_work"/>
|
||||
<ok to="Gen_Enriched_Orcid_Works"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name = "fork_gen_orcid_author_work">
|
||||
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||
<path start = "check_exist_on_hdfs_activities_3"/>
|
||||
<path start = "check_exist_on_hdfs_activities_4"/>
|
||||
<path start = "check_exist_on_hdfs_activities_5"/>
|
||||
<path start = "check_exist_on_hdfs_activities_6"/>
|
||||
<path start = "check_exist_on_hdfs_activities_7"/>
|
||||
<path start = "check_exist_on_hdfs_activities_8"/>
|
||||
<path start = "check_exist_on_hdfs_activities_9"/>
|
||||
<path start = "check_exist_on_hdfs_activities_X"/>
|
||||
</fork>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_0">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_0">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_0" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_0">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_0}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_0"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_0">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_1">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_1">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_1" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_1}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_1"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_1">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_2">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_2">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_2" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_2">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_2}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_2">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_3">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_3">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_3" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_3">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_3}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_3"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_3">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_4">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_4">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_4" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_4">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_4}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_4"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_4">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_5">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_5">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_5" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_5">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_5}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_5"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_5">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_6">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_6">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_6" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_6">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_6}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_6"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_6">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_7">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_7">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_7" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_7">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_7}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_7"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_7">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_8">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_8">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_8" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_8">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_8}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_8"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_8">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_9">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_9">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_9" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_9">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_9}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_9"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_9">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_X">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_X">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_X" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_X">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_X}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_X"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_X">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
|
||||
|
||||
<action name="Gen_Enriched_Orcid_Works">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
@ -15,24 +21,44 @@
|
|||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/output'/>
|
||||
<mkdir path='${workingPath}/output'/>
|
||||
<delete path='${workingPath}/summaries/output'/>
|
||||
<mkdir path='${workingPath}/summaries/output'/>
|
||||
</fs>
|
||||
<ok to="ImportOrcidSummary"/>
|
||||
<ok to="check_exist_on_hdfs_summaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_summaries">
|
||||
<switch>
|
||||
<case to="ImportOrcidSummaries">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
|
||||
</case>
|
||||
<default to="DownloadSummaries" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="DownloadSummaries">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_0}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="ImportOrcidSummaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<action name="ImportOrcidSummary">
|
||||
<action name="ImportOrcidSummaries">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||
<arg>-d</arg><arg>${workingPath}/</arg>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>output/</arg>
|
||||
<arg>-o</arg><arg>summaries/output/</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.java</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx4g</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,514 @@
|
|||
<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_1</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_2</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_3</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_4</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_5</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_6</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_7</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_8</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_9</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_X</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/no_doi_works/*'/>
|
||||
</fs>
|
||||
<ok to="fork_gen_orcid_author_work"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name = "fork_gen_orcid_author_work">
|
||||
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||
<path start = "check_exist_on_hdfs_activities_3"/>
|
||||
<path start = "check_exist_on_hdfs_activities_4"/>
|
||||
<path start = "check_exist_on_hdfs_activities_5"/>
|
||||
<path start = "check_exist_on_hdfs_activities_6"/>
|
||||
<path start = "check_exist_on_hdfs_activities_7"/>
|
||||
<path start = "check_exist_on_hdfs_activities_8"/>
|
||||
<path start = "check_exist_on_hdfs_activities_9"/>
|
||||
<path start = "check_exist_on_hdfs_activities_X"/>
|
||||
</fork>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_0">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_0">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_0" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_0">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_0}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_0"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_0">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_1">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_1">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_1" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_1}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_1"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_1">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_2">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_2">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_2" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_2">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_2}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_2">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_3">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_3">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_3" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_3">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_3}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_3"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_3">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_4">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_4">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_4" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_4">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_4}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_4"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_4">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_5">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_5">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_5" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_5">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_5}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_5"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_5">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_6">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_6">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_6" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_6">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_6}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_6"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_6">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_7">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_7">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_7" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_7">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_7}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_7"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_7">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_8">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_8">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_8" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_8">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_8}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_8"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_8">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_9">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_9">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_9" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_9">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_9}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_9"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_9">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_X">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_X">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_X" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_X">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_X}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_X"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_X">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name = "join_node" to = "End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,22 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,68 @@
|
|||
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/summaries/output'/>
|
||||
<mkdir path='${workingPath}/summaries/output'/>
|
||||
</fs>
|
||||
<ok to="check_exist_on_hdfs_summaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_summaries">
|
||||
<switch>
|
||||
<case to="ImportOrcidSummaries">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
|
||||
</case>
|
||||
<default to="DownloadSummaries" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="DownloadSummaries">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_0}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="ImportOrcidSummaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOrcidSummaries">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>summaries/output/</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -3,9 +3,8 @@ package eu.dnetlib.doiboost.orcid;
|
|||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.*;
|
||||
import java.nio.file.Files;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
|
@ -20,6 +19,7 @@ import org.apache.http.impl.client.HttpClients;
|
|||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class OrcidClientTest {
|
||||
final String orcidId = "0000-0001-7291-3210";
|
||||
|
@ -32,11 +32,20 @@ public class OrcidClientTest {
|
|||
String lastUpdate = "2019-09-30 00:00:00";
|
||||
String shortDate = "2020-05-06 16:06:11";
|
||||
|
||||
// curl -i -H "Accept: application/vnd.orcid+xml"
|
||||
// curl -i -H "Accept: application/vnd.orcid+xml"
|
||||
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
|
||||
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
|
||||
|
||||
public String testDownloadRecord(String orcidId) throws Exception {
|
||||
@Test
|
||||
public void downloadTest() throws Exception {
|
||||
String record = testDownloadRecord("0000-0002-2536-4498");
|
||||
File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
|
||||
OutputStream outStream = new FileOutputStream(f);
|
||||
IOUtils.write(record.getBytes(), outStream);
|
||||
System.out.println("saved to tmp");
|
||||
}
|
||||
|
||||
private String testDownloadRecord(String orcidId) throws Exception {
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
|
@ -100,7 +109,7 @@ public class OrcidClientTest {
|
|||
}
|
||||
|
||||
// @Test
|
||||
public void getRecordDatestamp() throws ParseException {
|
||||
private void getRecordDatestamp() throws ParseException {
|
||||
Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
|
||||
Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
|
||||
Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
||||
|
@ -108,7 +117,7 @@ public class OrcidClientTest {
|
|||
assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
|
||||
}
|
||||
|
||||
public void testDate(String value) throws ParseException {
|
||||
private void testDate(String value) throws ParseException {
|
||||
System.out.println(value.toString());
|
||||
if (value.length() != 19) {
|
||||
value = value.substring(0, 19);
|
||||
|
@ -118,14 +127,16 @@ public class OrcidClientTest {
|
|||
}
|
||||
|
||||
// @Test
|
||||
public void testModifiedDate() throws ParseException {
|
||||
@Ignore
|
||||
private void testModifiedDate() throws ParseException {
|
||||
testDate(toRetrieveDate);
|
||||
testDate(toNotRetrieveDate);
|
||||
testDate(shortDate);
|
||||
}
|
||||
|
||||
// @Test
|
||||
public void testReadBase64CompressedRecord() throws Exception {
|
||||
@Ignore
|
||||
private void testReadBase64CompressedRecord() throws Exception {
|
||||
final String base64CompressedRecord = IOUtils
|
||||
.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
|
||||
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
|
||||
|
|
|
@ -13,14 +13,15 @@ import com.google.gson.JsonParser;
|
|||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class PublicationToOafTest {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
|
||||
|
||||
@Test
|
||||
// @Ignore
|
||||
public void convertOafPublicationTest() throws Exception {
|
||||
@Ignore
|
||||
private void convertOafPublicationTest() throws Exception {
|
||||
String jsonPublication = IOUtils
|
||||
.toString(
|
||||
PublicationToOafTest.class.getResourceAsStream("publication.json"));
|
||||
|
|
|
@ -42,12 +42,12 @@ public class OrcidNoDoiTest {
|
|||
|
||||
@Test
|
||||
@Ignore
|
||||
private void readPublicationFieldsTest()
|
||||
public void readPublicationFieldsTest()
|
||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||
logger.info("running loadPublicationFieldsTest ....");
|
||||
String xml = IOUtils
|
||||
.toString(
|
||||
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
|
||||
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml"));
|
||||
|
||||
if (xml == null) {
|
||||
logger.info("Resource not found");
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<work:work xmlns:address="http://www.orcid.org/ns/address"
|
||||
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
|
||||
xmlns:employment="http://www.orcid.org/ns/employment"
|
||||
xmlns:education="http://www.orcid.org/ns/education"
|
||||
xmlns:other-name="http://www.orcid.org/ns/other-name"
|
||||
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
|
||||
xmlns:funding="http://www.orcid.org/ns/funding"
|
||||
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
|
||||
xmlns:service="http://www.orcid.org/ns/service"
|
||||
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
|
||||
xmlns:distinction="http://www.orcid.org/ns/distinction"
|
||||
xmlns:internal="http://www.orcid.org/ns/internal"
|
||||
xmlns:membership="http://www.orcid.org/ns/membership"
|
||||
xmlns:person="http://www.orcid.org/ns/person"
|
||||
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
|
||||
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
|
||||
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
|
||||
xmlns:activities="http://www.orcid.org/ns/activities"
|
||||
xmlns:qualification="http://www.orcid.org/ns/qualification"
|
||||
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
|
||||
xmlns:error="http://www.orcid.org/ns/error"
|
||||
xmlns:preferences="http://www.orcid.org/ns/preferences"
|
||||
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
|
||||
xmlns:work="http://www.orcid.org/ns/work"
|
||||
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="63461376"
|
||||
path="/0000-0002-2536-4498/work/63461376" visibility="public">
|
||||
<common:created-date>2019-10-22T03:18:13.755Z</common:created-date>
|
||||
<common:last-modified-date>2020-06-17T11:07:13.703Z</common:last-modified-date>
|
||||
<common:source>
|
||||
<common:source-client-id>
|
||||
<common:uri>https://orcid.org/client/0000-0001-8607-8906</common:uri>
|
||||
<common:path>0000-0001-8607-8906</common:path>
|
||||
<common:host>orcid.org</common:host>
|
||||
</common:source-client-id>
|
||||
<common:source-name>INSPIRE-HEP</common:source-name>
|
||||
</common:source>
|
||||
<work:title>
|
||||
<common:title>Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector</common:title>
|
||||
</work:title>
|
||||
<common:external-ids>
|
||||
<common:external-id>
|
||||
<common:external-id-type>other-id</common:external-id-type>
|
||||
<common:external-id-value>1759875</common:external-id-value>
|
||||
<common:external-id-normalized transient="true">1759875</common:external-id-normalized>
|
||||
<common:external-id-url>http://inspirehep.net/record/1759875</common:external-id-url>
|
||||
<common:external-id-relationship>self</common:external-id-relationship>
|
||||
</common:external-id>
|
||||
<common:external-id>
|
||||
<common:external-id-type>doi</common:external-id-type>
|
||||
<common:external-id-value>10.1140/epjc/s10052-020-7907-9</common:external-id-value>
|
||||
<common:external-id-normalized transient="true">10.1140/epjc/s10052-020-7907-9</common:external-id-normalized>
|
||||
<common:external-id-url>http://dx.doi.org/10.1140/epjc/s10052-020-7907-9</common:external-id-url>
|
||||
<common:external-id-relationship>self</common:external-id-relationship>
|
||||
</common:external-id>
|
||||
<common:external-id>
|
||||
<common:external-id-type>arxiv</common:external-id-type>
|
||||
<common:external-id-value>1910.08819</common:external-id-value>
|
||||
<common:external-id-normalized transient="true">arXiv:1910.08819</common:external-id-normalized>
|
||||
<common:external-id-url>http://arxiv.org/abs/1910.08819</common:external-id-url>
|
||||
<common:external-id-relationship>self</common:external-id-relationship>
|
||||
</common:external-id>
|
||||
</common:external-ids>
|
||||
<common:url>http://inspirehep.net/record/1759875</common:url>
|
||||
<work:type>journal-article</work:type>
|
||||
<common:publication-date>
|
||||
<common:year>2020</common:year>
|
||||
<common:month>06</common:month>
|
||||
<common:day>12</common:day>
|
||||
</common:publication-date>
|
||||
<work:journal-title>Eur.Phys.J.C</work:journal-title>
|
||||
</work:work>
|
Loading…
Reference in New Issue
Let the exception propagate and break the job