orcid-no-doi #43

Merged
claudio.atzori merged 45 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2020-12-02 10:55:12 +01:00
20 changed files with 815 additions and 543 deletions
Showing only changes of commit ca37d3427b

View File

@@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
public void generateAuthorsDOIsData() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
-String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
-Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
+String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
+Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
}
@@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
-hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-Log.info("Default Path: " + hdfsOrcidDefaultPath);
+workingPath = parser.get("workingPath");
+Log.info("Default Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");

View File

@@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDSManager {
protected String hdfsServerUri;
-protected String hdfsOrcidDefaultPath;
+protected String workingPath;
private String summariesFileNameTarGz;
private String outputAuthorsPath;
@@ -28,10 +28,10 @@ public class OrcidDSManager {
public void generateAuthors() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
-String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
+String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
-.concat(hdfsOrcidDefaultPath)
+.concat(workingPath)
.concat(outputAuthorsPath)
.concat("authors.seq"));
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
@@ -41,7 +41,7 @@ public class OrcidDSManager {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
-conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
+conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
@@ -52,7 +52,7 @@ public class OrcidDSManager {
// Get the filesystem - HDFS
FileSystem fs = null;
try {
-fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
+fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();

Review

Let the exception propagate and break the job
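A minimal sketch of the suggested fix, assuming the initFileSystemObject(conf) signature implied by its callers in this class (the wrapper class name below is hypothetical): FileSystem.get already throws IOException, so declaring the exception instead of catching it is enough to break the job rather than continue with a null FileSystem.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class OrcidDSManagerSketch {

	protected String hdfsServerUri;
	protected String workingPath;

	// No try/catch here: an unreachable or misconfigured HDFS raises an
	// IOException that propagates up and fails the Oozie action, as requested.
	protected FileSystem initFileSystemObject(Configuration conf) throws IOException {
		return FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
	}
}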
@@ -71,8 +71,8 @@ public class OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
-hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-Log.info("Default Path: " + hdfsOrcidDefaultPath);
+workingPath = parser.get("workingPath");
+Log.info("Working Path: " + workingPath);
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
Log.info("Summaries File Name: " + summariesFileNameTarGz);
outputAuthorsPath = parser.get("outputAuthorsPath");

View File

@@ -69,12 +69,12 @@ public class OrcidDownloader extends OrcidDSManager {
long startDownload = 0;
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
-String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
+String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
Path hdfsreadpath = new Path(lambdaFileUri);
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
Path hdfsoutputPath = new Path(
hdfsServerUri
-.concat(hdfsOrcidDefaultPath)
+.concat(workingPath)
.concat(outputPath)
.concat("orcid_records.seq"));
@@ -176,8 +176,8 @@ public class OrcidDownloader extends OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
-hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-Log.info("Default Path: " + hdfsOrcidDefaultPath);
+workingPath = parser.get("workingPath");
+Log.info("Default Path: " + workingPath);
lambdaFileName = parser.get("lambdaFileName");
Log.info("Lambda File Name: " + lambdaFileName);
outputPath = parser.get("outputPath");

View File

@@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
public class ActivitiesDumpReader {
-private static final int MAX_XML_WORKS_PARSED = 100;
-private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10;
+private static final int MAX_XML_WORKS_PARSED = -1;
+private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
throws Exception {
@@ -127,7 +127,7 @@ public class ActivitiesDumpReader {
Log
.warn(
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
-Log.warn(e);
+// Log.warn(e);
}

Review

What is the reason for neither handling this exception nor letting it propagate? I imagine a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and dig deeper to spot the error. As it stands, the error would likely go unnoticed while causing a drop in the number of output records.
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
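To make the concern concrete, a fail-fast variant of that catch block could look like the sketch below; parseWork is a hypothetical stand-in for the real XML parsing call inside the loop of ActivitiesDumpReader.

public class ActivitiesParseSketch {

	// Hypothetical fail-fast rework: the offending tar entry is named in the
	// exception and the procedure stops, so dropped records cannot go unnoticed.
	private static void parseWorkOrFail(String filename, String xml) {
		try {
			parseWork(xml);
		} catch (Exception e) {
			throw new IllegalStateException("Unparsable xml work: " + filename, e);
		}
	}

	private static void parseWork(String xml) {
		// placeholder for the real parsing of a single xml work record
	}
}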

View File

@@ -16,7 +16,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
private String activitiesFileNameTarGz;
private String outputWorksPath;
-private String workingPath;
+// private String workingPath;
public static void main(String[] args) throws IOException, Exception {
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
@@ -45,7 +45,6 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
-hdfsOrcidDefaultPath = workingPath;
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputWorksPath = parser.get("outputWorksPath");

View File

@@ -16,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -24,9 +25,11 @@ import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import scala.Tuple2;
@@ -59,7 +62,7 @@ public class SparkGenEnrichedOrcidWorks {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc
-.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
+.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark
.createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
@@ -89,8 +92,19 @@ public class SparkGenEnrichedOrcidWorks {
.filter(Objects::nonNull)
.toJavaRDD();
logger.info("Works enriched data created: " + enrichedWorksRDD.count());
-enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath);
+enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
logger.info("Works enriched data saved");
+JavaRDD<Tuple2<String, Publication>> oafPublicationRDD = enrichedWorksRDD.map(e -> {
+JsonElement j = new JsonParser().parse(e._2());
+return new Tuple2<>(e._1(), (Publication) PublicationToOaf
+.generatePublicationActionsFromDump(j.getAsJsonObject()));
+});
+Dataset<Tuple2<String, Publication>> publicationDataset = spark
+.createDataset(
+oafPublicationRDD.repartition(1).rdd(),
+Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class)));
+publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
});
}

View File

@@ -172,7 +172,7 @@ public class PublicationToOaf {
instance.setUrl(urls);
}
-final String pubDate = getPublicationDate(rootElement, "publication_date");
+final String pubDate = getPublicationDate(rootElement, "publicationDates");
if (StringUtils.isNotBlank(pubDate)) {
instance.setDateofacceptance(mapStringField(pubDate, null));
}
Review

Is the caller expecting the `null`? Otherwise this would likely produce an NPE.

Review

yes, there is a check on null value
@@ -325,7 +325,12 @@ public class PublicationToOaf {
private static String getPublicationDate(final JsonObject rootElement,
final String jsonKey) {
-final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
+JsonObject pubDateJson = null;
+try {
+pubDateJson = rootElement.getAsJsonObject(jsonKey);
+} catch (Exception e) {
Review

Is the caller expecting the `null`? Otherwise this would likely produce an NPE.

Review

yes, there is a check on null value
+return null;
+}
if (pubDateJson == null) {
return null;
}
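The reply refers to the guard visible earlier in this diff: StringUtils.isNotBlank treats null as blank, so the null returned by getPublicationDate never reaches mapStringField. A small self-contained illustration, assuming commons-lang3 (the StringUtils already used in this class):

import org.apache.commons.lang3.StringUtils;

public class NullDateCheckExample {

	public static void main(String[] args) {
		String pubDate = null; // what getPublicationDate returns for a missing or malformed date
		// isNotBlank(null) is false, so no NPE can occur here
		if (StringUtils.isNotBlank(pubDate)) {
			System.out.println("dateofacceptance set to " + pubDate);
		} else {
			System.out.println("publication date skipped");
		}
	}
}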

View File

@@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
-{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
+{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
]

View File

@@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
-{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
+{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
]

View File

@@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
-{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
+{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}

View File

@@ -1,75 +1,9 @@
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
-<name>workingPath_activities</name>
+<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property>
<property>
<name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property>
<property>
<name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property>
<property>
<name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property>
<property>
<name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property>
<property>
<name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property>
<property>
<name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property>
<property>
<name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property>
<property>
<name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property>
<property>
<name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property>
<property>
<name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
@@ -80,436 +14,11 @@
<action name="ResetWorkingPath">
<fs>
-<delete path='${workingPath_activities}/no_doi_works/*'/>
-<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
+<delete path='${workingPath}/no_doi_enriched_works/output'/>
</fs>
-<ok to="fork_gen_orcid_author_work"/>
+<ok to="Gen_Enriched_Orcid_Works"/>
<error to="Kill"/>
</action>
<fork name = "fork_gen_orcid_author_work">
<path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/>
<path start = "check_exist_on_hdfs_activities_3"/>
<path start = "check_exist_on_hdfs_activities_4"/>
<path start = "check_exist_on_hdfs_activities_5"/>
<path start = "check_exist_on_hdfs_activities_6"/>
<path start = "check_exist_on_hdfs_activities_7"/>
<path start = "check_exist_on_hdfs_activities_8"/>
<path start = "check_exist_on_hdfs_activities_9"/>
<path start = "check_exist_on_hdfs_activities_X"/>
</fork>
<decision name="check_exist_on_hdfs_activities_0">
<switch>
<case to="GenOrcidAuthorWork_0">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
</case>
<default to="Download_0" />
</switch>
</decision>
<action name="Download_0">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_0"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_1">
<switch>
<case to="GenOrcidAuthorWork_1">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
</case>
<default to="Download_1" />
</switch>
</decision>
<action name="Download_1">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_1}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_1"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_2">
<switch>
<case to="GenOrcidAuthorWork_2">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
</case>
<default to="Download_2" />
</switch>
</decision>
<action name="Download_2">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_2}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_2"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_3">
<switch>
<case to="GenOrcidAuthorWork_3">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
</case>
<default to="Download_3" />
</switch>
</decision>
<action name="Download_3">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_3}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_3"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_4">
<switch>
<case to="GenOrcidAuthorWork_4">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
</case>
<default to="Download_4" />
</switch>
</decision>
<action name="Download_4">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_4}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_4"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_5">
<switch>
<case to="GenOrcidAuthorWork_5">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
</case>
<default to="Download_5" />
</switch>
</decision>
<action name="Download_5">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_5}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_5"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_6">
<switch>
<case to="GenOrcidAuthorWork_6">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
</case>
<default to="Download_6" />
</switch>
</decision>
<action name="Download_6">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_6}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_6"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_7">
<switch>
<case to="GenOrcidAuthorWork_7">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
</case>
<default to="Download_7" />
</switch>
</decision>
<action name="Download_7">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_7}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_7"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_8">
<switch>
<case to="GenOrcidAuthorWork_8">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
</case>
<default to="Download_8" />
</switch>
</decision>
<action name="Download_8">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_8}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_8"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_9">
<switch>
<case to="GenOrcidAuthorWork_9">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
</case>
<default to="Download_9" />
</switch>
</decision>
<action name="Download_9">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_9}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_9"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_X">
<switch>
<case to="GenOrcidAuthorWork_X">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
</case>
<default to="Download_X" />
</switch>
</decision>
<action name="Download_X">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_X}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_X"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
<action name="Gen_Enriched_Orcid_Works"> <action name="Gen_Enriched_Orcid_Works">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">

View File

@@ -1,9 +1,15 @@
-<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
+<property>
+<name>shell_cmd_0</name>
+<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
+</value>
+<description>the shell command that downloads and puts to hdfs orcid summaries</description>
+</property>
</parameters>
<start to="ResetWorkingPath"/>
@@ -15,24 +21,44 @@
<action name="ResetWorkingPath">
<fs>
-<delete path='${workingPath}/output'/>
-<mkdir path='${workingPath}/output'/>
+<delete path='${workingPath}/summaries/output'/>
+<mkdir path='${workingPath}/summaries/output'/>
</fs>
-<ok to="ImportOrcidSummary"/>
+<ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/>
</action>
+<decision name="check_exist_on_hdfs_summaries">
+<switch>
+<case to="ImportOrcidSummaries">
+${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
+</case>
+<default to="DownloadSummaries" />
+</switch>
+</decision>
+<action name="DownloadSummaries">
+<shell xmlns="uri:oozie:shell-action:0.1">
+<job-tracker>${jobTracker}</job-tracker>
+<name-node>${nameNode}</name-node>
+<exec>bash</exec>
+<argument>-c</argument>
+<argument>${shell_cmd_0}</argument>
+<capture-output/>
+</shell>
+<ok to="ImportOrcidSummaries"/>
+<error to="Kill"/>
+</action>
-<action name="ImportOrcidSummary">
+<action name="ImportOrcidSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
-<arg>-d</arg><arg>${workingPath}/</arg>
+<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
-<arg>-o</arg><arg>output/</arg>
+<arg>-o</arg><arg>summaries/output/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>

View File

@@ -0,0 +1,31 @@
<configuration>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value>
</property>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@@ -0,0 +1,514 @@
<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property>
<property>
<name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property>
<property>
<name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property>
<property>
<name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property>
<property>
<name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property>
<property>
<name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property>
<property>
<name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property>
<property>
<name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property>
<property>
<name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property>
<property>
<name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property>
<property>
<name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/no_doi_works/*'/>
</fs>
<ok to="fork_gen_orcid_author_work"/>
<error to="Kill"/>
</action>
<fork name = "fork_gen_orcid_author_work">
<path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/>
<path start = "check_exist_on_hdfs_activities_3"/>
<path start = "check_exist_on_hdfs_activities_4"/>
<path start = "check_exist_on_hdfs_activities_5"/>
<path start = "check_exist_on_hdfs_activities_6"/>
<path start = "check_exist_on_hdfs_activities_7"/>
<path start = "check_exist_on_hdfs_activities_8"/>
<path start = "check_exist_on_hdfs_activities_9"/>
<path start = "check_exist_on_hdfs_activities_X"/>
</fork>
<decision name="check_exist_on_hdfs_activities_0">
<switch>
<case to="GenOrcidAuthorWork_0">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
</case>
<default to="Download_0" />
</switch>
</decision>
<action name="Download_0">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_0"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_1">
<switch>
<case to="GenOrcidAuthorWork_1">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
</case>
<default to="Download_1" />
</switch>
</decision>
<action name="Download_1">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_1}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_1"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_2">
<switch>
<case to="GenOrcidAuthorWork_2">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
</case>
<default to="Download_2" />
</switch>
</decision>
<action name="Download_2">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_2}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_2"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_3">
<switch>
<case to="GenOrcidAuthorWork_3">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
</case>
<default to="Download_3" />
</switch>
</decision>
<action name="Download_3">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_3}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_3"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_4">
<switch>
<case to="GenOrcidAuthorWork_4">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
</case>
<default to="Download_4" />
</switch>
</decision>
<action name="Download_4">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_4}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_4"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_5">
<switch>
<case to="GenOrcidAuthorWork_5">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
</case>
<default to="Download_5" />
</switch>
</decision>
<action name="Download_5">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_5}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_5"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_6">
<switch>
<case to="GenOrcidAuthorWork_6">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
</case>
<default to="Download_6" />
</switch>
</decision>
<action name="Download_6">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_6}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_6"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_7">
<switch>
<case to="GenOrcidAuthorWork_7">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
</case>
<default to="Download_7" />
</switch>
</decision>
<action name="Download_7">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_7}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_7"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_8">
<switch>
<case to="GenOrcidAuthorWork_8">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
</case>
<default to="Download_8" />
</switch>
</decision>
<action name="Download_8">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_8}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_8"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_9">
<switch>
<case to="GenOrcidAuthorWork_9">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
</case>
<default to="Download_9" />
</switch>
</decision>
<action name="Download_9">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_9}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_9"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_X">
<switch>
<case to="GenOrcidAuthorWork_X">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
</case>
<default to="Download_X" />
</switch>
</decision>
<action name="Download_X">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_X}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_X"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "join_node" to = "End"/>
<end name="End"/>
</workflow-app>

View File

@@ -0,0 +1,22 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@@ -0,0 +1,68 @@
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
</value>
<description>the shell command that downloads the ORCID summaries dump and copies it to HDFS</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/summaries/output'/>
<mkdir path='${workingPath}/summaries/output'/>
</fs>
<ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_summaries">
<switch>
<case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
</case>
<default to="DownloadSummaries" />
</switch>
</decision>
<action name="DownloadSummaries">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="ImportOrcidSummaries"/>
<error to="Kill"/>
</action>
<action name="ImportOrcidSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
<arg>-o</arg><arg>summaries/output/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
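
The DownloadSummaries shell action simply chains wget, hdfs dfs -copyFromLocal and rm (see shell_cmd_0 above). A rough Java equivalent, assuming the figshare URL and target path from that property (class and variable names are illustrative):

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DownloadSummariesDump {
	public static void main(String[] args) throws Exception {
		// download the dump to a local staging file
		java.nio.file.Path local = Paths.get("/tmp/ORCID_2019_summaries.tar.gz");
		try (InputStream in = new URL("https://orcid.figshare.com/ndownloader/files/18017633").openStream()) {
			Files.copy(in, local, StandardCopyOption.REPLACE_EXISTING);
		}
		// stage it on HDFS, then clean up the local copy
		FileSystem fs = FileSystem.get(new Configuration());
		fs.copyFromLocalFile(new Path(local.toString()),
			new Path("/data/orcid_activities/ORCID_2019_summaries.tar.gz"));
		Files.delete(local);
	}
}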


@ -3,9 +3,8 @@ package eu.dnetlib.doiboost.orcid;
 import static org.junit.jupiter.api.Assertions.assertTrue;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.*;
+import java.nio.file.Files;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
@ -20,6 +19,7 @@ import org.apache.http.impl.client.HttpClients;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 public class OrcidClientTest {
 	final String orcidId = "0000-0001-7291-3210";
@ -32,11 +32,20 @@ public class OrcidClientTest {
 	String lastUpdate = "2019-09-30 00:00:00";
 	String shortDate = "2020-05-06 16:06:11";
 	// curl -i -H "Accept: application/vnd.orcid+xml"
 	// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
 	// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
-	public String testDownloadRecord(String orcidId) throws Exception {
+	@Test
+	public void downloadTest() throws Exception {
+		String record = testDownloadRecord("0000-0002-2536-4498");
+		File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
+		OutputStream outStream = new FileOutputStream(f);
+		IOUtils.write(record.getBytes(), outStream);
+		System.out.println("saved to tmp");
+	}
+
+	private String testDownloadRecord(String orcidId) throws Exception {
 		try (CloseableHttpClient client = HttpClients.createDefault()) {
 			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
 			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
@ -100,7 +109,7 @@ public class OrcidClientTest {
 	}
 	// @Test
-	public void getRecordDatestamp() throws ParseException {
+	private void getRecordDatestamp() throws ParseException {
 		Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
 		Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
 		Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
@ -108,7 +117,7 @@ public class OrcidClientTest {
 		assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
 	}
-	public void testDate(String value) throws ParseException {
+	private void testDate(String value) throws ParseException {
 		System.out.println(value.toString());
 		if (value.length() != 19) {
 			value = value.substring(0, 19);
@ -118,14 +127,16 @@ public class OrcidClientTest {
 	}
 	// @Test
-	public void testModifiedDate() throws ParseException {
+	@Ignore
+	private void testModifiedDate() throws ParseException {
 		testDate(toRetrieveDate);
 		testDate(toNotRetrieveDate);
 		testDate(shortDate);
 	}
 	// @Test
-	public void testReadBase64CompressedRecord() throws Exception {
+	@Ignore
+	private void testReadBase64CompressedRecord() throws Exception {
 		final String base64CompressedRecord = IOUtils
 			.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
 		final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
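
The last hunk round-trips a record fixture through ArgumentApplicationParser.decompressValue. Assuming the sequence-file values are gzip-compressed and base64-encoded, as the *.compressed.base64 fixture name suggests, the decode step would look roughly like this sketch (not the actual implementation):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;

public class RecordCodec {
	// assumed mirror of decompressValue: base64-decode, then gunzip back to XML
	static String decompress(String base64Compressed) throws IOException {
		byte[] bytes = Base64.getDecoder().decode(base64Compressed);
		try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(bytes))) {
			return IOUtils.toString(gzip, StandardCharsets.UTF_8);
		}
	}
}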


@ -13,14 +13,15 @@ import com.google.gson.JsonParser;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 public class PublicationToOafTest {
 	private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
 	@Test
-	// @Ignore
-	public void convertOafPublicationTest() throws Exception {
+	@Ignore
+	private void convertOafPublicationTest() throws Exception {
 		String jsonPublication = IOUtils
 			.toString(
 				PublicationToOafTest.class.getResourceAsStream("publication.json"));


@ -42,12 +42,12 @@ public class OrcidNoDoiTest {
 	@Test
 	@Ignore
-	private void readPublicationFieldsTest()
+	public void readPublicationFieldsTest()
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
 		logger.info("running loadPublicationFieldsTest ....");
 		String xml = IOUtils
 			.toString(
-				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
+				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml"));
 		if (xml == null) {
 			logger.info("Resource not found");


@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
xmlns:employment="http://www.orcid.org/ns/employment"
xmlns:education="http://www.orcid.org/ns/education"
xmlns:other-name="http://www.orcid.org/ns/other-name"
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
xmlns:funding="http://www.orcid.org/ns/funding"
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
xmlns:service="http://www.orcid.org/ns/service"
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
xmlns:distinction="http://www.orcid.org/ns/distinction"
xmlns:internal="http://www.orcid.org/ns/internal"
xmlns:membership="http://www.orcid.org/ns/membership"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:activities="http://www.orcid.org/ns/activities"
xmlns:qualification="http://www.orcid.org/ns/qualification"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:error="http://www.orcid.org/ns/error"
xmlns:preferences="http://www.orcid.org/ns/preferences"
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
xmlns:work="http://www.orcid.org/ns/work"
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="63461376"
path="/0000-0002-2536-4498/work/63461376" visibility="public">
<common:created-date>2019-10-22T03:18:13.755Z</common:created-date>
<common:last-modified-date>2020-06-17T11:07:13.703Z</common:last-modified-date>
<common:source>
<common:source-client-id>
<common:uri>https://orcid.org/client/0000-0001-8607-8906</common:uri>
<common:path>0000-0001-8607-8906</common:path>
<common:host>orcid.org</common:host>
</common:source-client-id>
<common:source-name>INSPIRE-HEP</common:source-name>
</common:source>
<work:title>
<common:title>Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector</common:title>
</work:title>
<common:external-ids>
<common:external-id>
<common:external-id-type>other-id</common:external-id-type>
<common:external-id-value>1759875</common:external-id-value>
<common:external-id-normalized transient="true">1759875</common:external-id-normalized>
<common:external-id-url>http://inspirehep.net/record/1759875</common:external-id-url>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>doi</common:external-id-type>
<common:external-id-value>10.1140/epjc/s10052-020-7907-9</common:external-id-value>
<common:external-id-normalized transient="true">10.1140/epjc/s10052-020-7907-9</common:external-id-normalized>
<common:external-id-url>http://dx.doi.org/10.1140/epjc/s10052-020-7907-9</common:external-id-url>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>arxiv</common:external-id-type>
<common:external-id-value>1910.08819</common:external-id-value>
<common:external-id-normalized transient="true">arXiv:1910.08819</common:external-id-normalized>
<common:external-id-url>http://arxiv.org/abs/1910.08819</common:external-id-url>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
</common:external-ids>
<common:url>http://inspirehep.net/record/1759875</common:url>
<work:type>journal-article</work:type>
<common:publication-date>
<common:year>2020</common:year>
<common:month>06</common:month>
<common:day>12</common:day>
</common:publication-date>
<work:journal-title>Eur.Phys.J.C</work:journal-title>
</work:work>
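
This fixture is what readPublicationFieldsTest parses; the common:external-ids block carries the identifiers (other-id, doi, arxiv) that the no-DOI pipeline inspects. A standalone sketch that pulls the DOI out of such a record with plain JAXP XPath (the module itself uses VTD-XML, so this is only an illustration):

import java.io.File;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class WorkDoiExtractor {
	public static void main(String[] args) throws Exception {
		Document doc = DocumentBuilderFactory.newInstance()
			.newDocumentBuilder()
			.parse(new File("activity_work_0000-0002-2536-4498.xml"));
		XPath xpath = XPathFactory.newInstance().newXPath();
		// match on local-name() to sidestep namespace configuration
		String doi = xpath.evaluate(
			"//*[local-name()='external-id'][*[local-name()='external-id-type']='doi']"
				+ "/*[local-name()='external-id-value']",
			doc);
		System.out.println(doi); // 10.1140/epjc/s10052-020-7907-9
	}
}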