first version of dataset successfully generated from orcid dump 2020

This commit is contained in:
Enrico Ottonello 2020-11-06 13:47:50 +01:00
parent 9818e74a70
commit 6bc7dbeca7
20 changed files with 320 additions and 228 deletions

View File

@ -51,7 +51,6 @@
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>${org.apache.httpcomponents.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
@ -87,7 +86,6 @@
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId> <artifactId>commons-text</artifactId>
<version>${common.text.version}</version>
</dependency> </dependency>

View File

@ -62,7 +62,7 @@ public class OrcidDSManager {
.toString( .toString(
OrcidDSManager.class OrcidDSManager.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
parser.parseArgument(args); parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri"); hdfsServerUri = parser.get("hdfsServerUri");

View File

@ -73,7 +73,7 @@ public class ActivitiesDumpReader {
SequenceFile.Writer.valueClass(Text.class))) { SequenceFile.Writer.valueClass(Text.class))) {
while ((entry = tais.getNextTarEntry()) != null) { while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName(); String filename = entry.getName();
StringBuffer buffer = new StringBuffer();
try { try {
if (entry.isDirectory() || !filename.contains("works")) { if (entry.isDirectory() || !filename.contains("works")) {
@ -83,7 +83,7 @@ public class ActivitiesDumpReader {
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
// tarInput // tarInput
String line; String line;
StringBuffer buffer = new StringBuffer(); buffer = new StringBuffer();
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
buffer.append(line); buffer.append(line);
} }

View File

@ -42,7 +42,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
.toString( .toString(
GenOrcidAuthorWork.class GenOrcidAuthorWork.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); "/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
parser.parseArgument(args); parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri"); hdfsServerUri = parser.get("hdfsServerUri");

View File

@ -67,7 +67,7 @@ public class SparkGenEnrichedOrcidWorks {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc JavaPairRDD<Text, Text> summariesRDD = sc
.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class); .sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark Dataset<AuthorData> summariesDataset = spark
.createDataset( .createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
@ -96,8 +96,8 @@ public class SparkGenEnrichedOrcidWorks {
Encoders.tuple(Encoders.STRING(), Encoders.STRING())) Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.toJavaRDD(); .toJavaRDD();
enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); // enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
logger.info("Works enriched data saved"); logger.info("Enriched works RDD ready.");
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
final LongAccumulator enrichedPublications = spark final LongAccumulator enrichedPublications = spark
@ -132,7 +132,7 @@ public class SparkGenEnrichedOrcidWorks {
.write() .write()
.format("parquet") .format("parquet")
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(workingPath + "no_doi_dataset/output"); .save(workingPath + outputEnrichedWorksPath);
logger.info("parsedPublications: " + parsedPublications.value().toString()); logger.info("parsedPublications: " + parsedPublications.value().toString());
logger.info("enrichedPublications: " + enrichedPublications.value().toString()); logger.info("enrichedPublications: " + enrichedPublications.value().toString());

View File

@ -5,6 +5,7 @@ import java.io.IOException;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -40,7 +41,7 @@ public class AuthorMatcher {
int matchCounter = 0; int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter); List<Integer> matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null; Contributor contributor = null;
contributors.forEach(c -> { contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
if (simpleMatch(c.getCreditName(), author.getName()) || if (simpleMatch(c.getCreditName(), author.getName()) ||
simpleMatch(c.getCreditName(), author.getSurname()) || simpleMatch(c.getCreditName(), author.getSurname()) ||
simpleMatch(c.getCreditName(), author.getOtherName())) { simpleMatch(c.getCreditName(), author.getOtherName())) {
@ -54,6 +55,7 @@ public class AuthorMatcher {
Optional<Contributor> optCon = contributors Optional<Contributor> optCon = contributors
.stream() .stream()
.filter(c -> c.isSimpleMatch()) .filter(c -> c.isSimpleMatch())
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.map(c -> { .map(c -> {
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
return c; return c;

View File

@ -183,39 +183,34 @@ public class XMLRecordParserNoDoi {
private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap) private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException { throws XPathParseException, NavException, XPathEvalException {
List<Contributor> contributors = new ArrayList<Contributor>(); List<Contributor> contributors = new ArrayList<Contributor>();
int nameIndex = 0; ap.selectXPath("//work:contributors/work:contributor");
ap.selectXPath("//work:contributor/work:credit-name");
while (ap.evalXPath() != -1) { while (ap.evalXPath() != -1) {
Contributor contributor = new Contributor(); Contributor contributor = new Contributor();
int t = vn.getText(); if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) {
if (t >= 0) { int val = vn.getText();
contributor.setCreditName(vn.toNormalizedString(t)); if (val != -1) {
contributors.add(nameIndex, contributor); contributor.setCreditName(vn.toNormalizedString(val));
nameIndex++; }
vn.toElement(VTDNav.PARENT);
} }
} if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
if (contributors.size() == 0) { if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) {
return contributors; int val = vn.getText();
} if (val != -1) {
contributor.setSequence(vn.toNormalizedString(val));
int sequenceIndex = 0; }
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence"); vn.toElement(VTDNav.PARENT);
while (ap.evalXPath() != -1) { }
int t = vn.getText(); if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) {
if (t >= 0) { int val = vn.getText();
contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t)); if (val != -1) {
sequenceIndex++; contributor.setRole(vn.toNormalizedString(val));
} }
} vn.toElement(VTDNav.PARENT);
}
int roleIndex = 0; vn.toElement(VTDNav.PARENT);
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
roleIndex++;
} }
contributors.add(contributor);
} }
return contributors; return contributors;
} }

View File

@ -0,0 +1,7 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
]

View File

@ -1,42 +0,0 @@
<configuration>
<!-- Job-level properties: cluster endpoints plus Oozie launcher and Spark 2 settings. -->
<!-- All host names below are hard-coded to one specific cluster (garr-pa1.d4science.org). -->
<!-- Endpoint of the job tracker / resource manager, given as host:port. -->
<property>
<name>jobTracker</name>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
</property>
<!-- HDFS name node URI. -->
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<!-- Oozie library settings: system libpath enabled, "spark2" sharelib selected for Spark actions. -->
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<!-- Presumably makes user-supplied jars win over cluster jars in the launcher; confirm against the Oozie docs. -->
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<!-- Thrift URI of the Hive metastore. -->
<property>
<name>hive_metastore_uris</name>
<value>thrift://hadoop-edge2.garr-pa1.d4science.org:9083</value>
</property>
<!-- Spark 2 history server address and event log directory. -->
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-edge1.garr-pa1.d4science.org:18089/</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<!-- Listener class names for Spark 2. -->
<!-- NOTE(review): the double quotes are part of each value; confirm the consumer expects them. -->
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

View File

@ -1,67 +0,0 @@
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
<!--
Workflow outline:
1. ResetWorkingPath: delete and re-create ${workingPath}/summaries/output.
2. check_exist_on_hdfs_summaries: if ${workingPath}/ORCID_2019_summaries.tar.gz exists, skip the download.
3. DownloadSummaries: otherwise run shell_cmd_0 to fetch the archive and copy it to HDFS.
4. ImportOrcidSummaries: run eu.dnetlib.doiboost.orcid.OrcidDSManager on the archive.
Any action error transitions to the Kill node.
-->
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<!-- Default download command: wget to /tmp, copy into HDFS, then remove the local copy. -->
<!-- NOTE(review): the HDFS target is hard-coded to /data/orcid_activities/, while the existence check below looks under ${workingPath}; the two only agree when workingPath is /data/orcid_activities. Confirm. -->
<!-- The commands are chained with ';', so a failed wget does not stop the following copy and rm. -->
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<!-- Terminal failure node; reports the error message of the last failed node. -->
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- Start from an empty output folder: drop any previous output, then re-create the directory. -->
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/summaries/output'/>
<mkdir path='${workingPath}/summaries/output'/>
</fs>
<ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/>
</action>
<!-- Go straight to the import when the archive is already on HDFS; otherwise download it first. -->
<decision name="check_exist_on_hdfs_summaries">
<switch>
<case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
</case>
<default to="DownloadSummaries" />
</switch>
</decision>
<!-- Runs shell_cmd_0 through "bash -c". -->
<action name="DownloadSummaries">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="ImportOrcidSummaries"/>
<error to="Kill"/>
</action>
<!-- Java action arguments: -w working path (with trailing slash), -n name node, -f archive file name, -o output folder. -->
<!-- The -o value is a relative path; presumably resolved against the working path by OrcidDSManager. Confirm in the class. -->
<action name="ImportOrcidSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
<arg>-o</arg><arg>summaries/output/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -9,7 +9,7 @@
</property> </property>
<property> <property>
<name>oozie.launcher.mapreduce.map.java.opts</name> <name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value> <value>-Xmx2g</value>
</property> </property>
<property> <property>
<name>jobTracker</name> <name>jobTracker</name>

View File

@ -1,4 +1,4 @@
<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Gen Orcid Works-no-doi From Activities" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>workingPath</name> <name>workingPath</name>
@ -6,67 +6,67 @@
</property> </property>
<property> <property>
<name>shell_cmd_0</name> <name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description> <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property> </property>
<property> <property>
<name>shell_cmd_1</name> <name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description> <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property> </property>
<property> <property>
<name>shell_cmd_2</name> <name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description> <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property> </property>
<property> <property>
<name>shell_cmd_3</name> <name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description> <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property> </property>
<property> <property>
<name>shell_cmd_4</name> <name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description> <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property> </property>
<property> <property>
<name>shell_cmd_5</name> <name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description> <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property> </property>
<property> <property>
<name>shell_cmd_6</name> <name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description> <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property> </property>
<property> <property>
<name>shell_cmd_7</name> <name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description> <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property> </property>
<property> <property>
<name>shell_cmd_8</name> <name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description> <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property> </property>
<property> <property>
<name>shell_cmd_9</name> <name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description> <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property> </property>
<property> <property>
<name>shell_cmd_X</name> <name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description> <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property> </property>
@ -82,11 +82,11 @@
<fs> <fs>
<delete path='${workingPath}/no_doi_works/*'/> <delete path='${workingPath}/no_doi_works/*'/>
</fs> </fs>
<ok to="fork_gen_orcid_author_work"/> <ok to="fork_check_download_files"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name = "fork_gen_orcid_author_work"> <fork name = "fork_check_download_files">
<path start = "check_exist_on_hdfs_activities_0"/> <path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/> <path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/> <path start = "check_exist_on_hdfs_activities_2"/>
@ -102,8 +102,8 @@
<decision name="check_exist_on_hdfs_activities_0"> <decision name="check_exist_on_hdfs_activities_0">
<switch> <switch>
<case to="GenOrcidAuthorWork_0"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))}
</case> </case>
<default to="Download_0" /> <default to="Download_0" />
</switch> </switch>
@ -118,7 +118,7 @@
<argument>${shell_cmd_0}</argument> <argument>${shell_cmd_0}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_0"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -129,7 +129,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -139,8 +139,8 @@
<decision name="check_exist_on_hdfs_activities_1"> <decision name="check_exist_on_hdfs_activities_1">
<switch> <switch>
<case to="GenOrcidAuthorWork_1"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))}
</case> </case>
<default to="Download_1" /> <default to="Download_1" />
</switch> </switch>
@ -155,7 +155,7 @@
<argument>${shell_cmd_1}</argument> <argument>${shell_cmd_1}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_1"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -166,7 +166,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -176,8 +176,8 @@
<decision name="check_exist_on_hdfs_activities_2"> <decision name="check_exist_on_hdfs_activities_2">
<switch> <switch>
<case to="GenOrcidAuthorWork_2"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))}
</case> </case>
<default to="Download_2" /> <default to="Download_2" />
</switch> </switch>
@ -192,7 +192,7 @@
<argument>${shell_cmd_2}</argument> <argument>${shell_cmd_2}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_2"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -203,7 +203,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -213,8 +213,8 @@
<decision name="check_exist_on_hdfs_activities_3"> <decision name="check_exist_on_hdfs_activities_3">
<switch> <switch>
<case to="GenOrcidAuthorWork_3"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))}
</case> </case>
<default to="Download_3" /> <default to="Download_3" />
</switch> </switch>
@ -229,7 +229,7 @@
<argument>${shell_cmd_3}</argument> <argument>${shell_cmd_3}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_3"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -240,7 +240,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -250,8 +250,8 @@
<decision name="check_exist_on_hdfs_activities_4"> <decision name="check_exist_on_hdfs_activities_4">
<switch> <switch>
<case to="GenOrcidAuthorWork_4"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))}
</case> </case>
<default to="Download_4" /> <default to="Download_4" />
</switch> </switch>
@ -266,7 +266,7 @@
<argument>${shell_cmd_4}</argument> <argument>${shell_cmd_4}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_4"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -277,7 +277,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -287,8 +287,8 @@
<decision name="check_exist_on_hdfs_activities_5"> <decision name="check_exist_on_hdfs_activities_5">
<switch> <switch>
<case to="GenOrcidAuthorWork_5"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))}
</case> </case>
<default to="Download_5" /> <default to="Download_5" />
</switch> </switch>
@ -303,7 +303,7 @@
<argument>${shell_cmd_5}</argument> <argument>${shell_cmd_5}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_5"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -314,7 +314,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -324,8 +324,8 @@
<decision name="check_exist_on_hdfs_activities_6"> <decision name="check_exist_on_hdfs_activities_6">
<switch> <switch>
<case to="GenOrcidAuthorWork_6"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))}
</case> </case>
<default to="Download_6" /> <default to="Download_6" />
</switch> </switch>
@ -340,7 +340,7 @@
<argument>${shell_cmd_6}</argument> <argument>${shell_cmd_6}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_6"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -351,7 +351,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -362,8 +362,8 @@
<decision name="check_exist_on_hdfs_activities_7"> <decision name="check_exist_on_hdfs_activities_7">
<switch> <switch>
<case to="GenOrcidAuthorWork_7"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))}
</case> </case>
<default to="Download_7" /> <default to="Download_7" />
</switch> </switch>
@ -378,7 +378,7 @@
<argument>${shell_cmd_7}</argument> <argument>${shell_cmd_7}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_7"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -389,7 +389,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -399,8 +399,8 @@
<decision name="check_exist_on_hdfs_activities_8"> <decision name="check_exist_on_hdfs_activities_8">
<switch> <switch>
<case to="GenOrcidAuthorWork_8"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))}
</case> </case>
<default to="Download_8" /> <default to="Download_8" />
</switch> </switch>
@ -415,7 +415,7 @@
<argument>${shell_cmd_8}</argument> <argument>${shell_cmd_8}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_8"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -426,7 +426,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -436,8 +436,8 @@
<decision name="check_exist_on_hdfs_activities_9"> <decision name="check_exist_on_hdfs_activities_9">
<switch> <switch>
<case to="GenOrcidAuthorWork_9"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))}
</case> </case>
<default to="Download_9" /> <default to="Download_9" />
</switch> </switch>
@ -452,7 +452,7 @@
<argument>${shell_cmd_9}</argument> <argument>${shell_cmd_9}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_9"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -463,7 +463,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -473,8 +473,8 @@
<decision name="check_exist_on_hdfs_activities_X"> <decision name="check_exist_on_hdfs_activities_X">
<switch> <switch>
<case to="GenOrcidAuthorWork_X"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))}
</case> </case>
<default to="Download_X" /> <default to="Download_X" />
</switch> </switch>
@ -489,7 +489,7 @@
<argument>${shell_cmd_X}</argument> <argument>${shell_cmd_X}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_X"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -500,7 +500,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -508,7 +508,35 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Download phase barrier: the check_exist_on_hdfs_activities_* decisions and the Download_*
     actions of this workflow transition to this join, which then hands over to the fork below. -->
<join name="wait_download_phase_node" to="fork_gen_orcid_author_work"/>
<!-- Starts one GenOrcidAuthorWork action per activities archive (suffixes 0-9 and X). -->
<fork name="fork_gen_orcid_author_work">
    <path start="GenOrcidAuthorWork_0"/>
    <path start="GenOrcidAuthorWork_1"/>
    <path start="GenOrcidAuthorWork_2"/>
    <path start="GenOrcidAuthorWork_3"/>
    <path start="GenOrcidAuthorWork_4"/>
    <path start="GenOrcidAuthorWork_5"/>
    <path start="GenOrcidAuthorWork_6"/>
    <path start="GenOrcidAuthorWork_7"/>
    <path start="GenOrcidAuthorWork_8"/>
    <path start="GenOrcidAuthorWork_9"/>
    <path start="GenOrcidAuthorWork_X"/>
</fork>
<join name = "join_node" to = "End"/> <join name = "join_node" to = "End"/>
<!-- <join name = "join_node" to = "fork_gen_orcid_author_work_2"/>-->
<!-- <fork name = "fork_gen_orcid_author_work_2">-->
<!-- <path start = "GenOrcidAuthorWork_6"/>-->
<!-- <path start = "GenOrcidAuthorWork_7"/>-->
<!-- <path start = "GenOrcidAuthorWork_8"/>-->
<!-- <path start = "GenOrcidAuthorWork_9"/>-->
<!-- <path start = "GenOrcidAuthorWork_X"/>-->
<!-- </fork>-->
<!-- <join name = "join_node_2" to = "End"/>-->
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -19,4 +19,8 @@
<name>oozie.launcher.mapreduce.user.classpath.first</name> <name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value> <value>true</value>
</property> </property>
<!-- JVM options for the map task of the Oozie launcher job: sets the maximum heap to 16 GB (-Xmx16g).
     NOTE(review): no matching container memory setting is visible in this part of the file;
     confirm the launcher container is sized to hold a 16 GB heap. -->
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx16g</value>
</property>
</configuration> </configuration>

View File

@ -1,4 +1,4 @@
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Gen Orcid Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>workingPath</name> <name>workingPath</name>
@ -6,7 +6,7 @@
</property> </property>
<property> <property>
<name>shell_cmd_0</name> <name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz <value>wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid summaries</description> <description>the shell command that downloads and puts to hdfs orcid summaries</description>
</property> </property>
@ -21,8 +21,8 @@
<action name="ResetWorkingPath"> <action name="ResetWorkingPath">
<fs> <fs>
<delete path='${workingPath}/summaries/output'/> <delete path='${workingPath}/authors'/>
<mkdir path='${workingPath}/summaries/output'/> <mkdir path='${workingPath}/authors'/>
</fs> </fs>
<ok to="check_exist_on_hdfs_summaries"/> <ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/> <error to="Kill"/>
@ -31,7 +31,7 @@
<decision name="check_exist_on_hdfs_summaries"> <decision name="check_exist_on_hdfs_summaries">
<switch> <switch>
<case to="ImportOrcidSummaries"> <case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))}
</case> </case>
<default to="DownloadSummaries" /> <default to="DownloadSummaries" />
</switch> </switch>
@ -57,8 +57,8 @@
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class> <main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
<arg>-o</arg><arg>summaries/output/</arg> <arg>-o</arg><arg>authors/</arg>
</java> </java>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -59,7 +59,7 @@
<action name="ResetWorkingPath"> <action name="ResetWorkingPath">
<fs> <fs>
<delete path='${workingPath}/no_doi_enriched_works/output'/> <delete path='${workingPath}/no_doi_dataset'/>
</fs> </fs>
<ok to="GenOrcidNoDoiDataset"/> <ok to="GenOrcidNoDoiDataset"/>
<error to="Kill"/> <error to="Kill"/>
@ -85,7 +85,7 @@
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg> <arg>-f</arg><arg>-</arg>
<arg>-ow</arg><arg>no_doi_works/</arg> <arg>-ow</arg><arg>no_doi_works/</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/output</arg> <arg>-oew</arg><arg>no_doi_dataset</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -38,8 +38,8 @@ public class OrcidClientTest {
@Test @Test
public void downloadTest() throws Exception { public void downloadTest() throws Exception {
String record = testDownloadRecord("0000-0002-2536-4498"); String record = testDownloadRecord("0000-0001-6163-2042");
File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml"); File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml");
OutputStream outStream = new FileOutputStream(f); OutputStream outStream = new FileOutputStream(f);
IOUtils.write(record.getBytes(), outStream); IOUtils.write(record.getBytes(), outStream);
System.out.println("saved to tmp"); System.out.println("saved to tmp");

View File

@ -2,15 +2,20 @@
package eu.dnetlib.doiboost.orcidnodoi.xml; package eu.dnetlib.doiboost.orcidnodoi.xml;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException; import java.io.IOException;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import javax.validation.constraints.AssertTrue;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaccardSimilarity; import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.mortbay.log.Log;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -41,7 +46,6 @@ public class OrcidNoDoiTest {
String orcidIdA = "0000-0003-2760-1191"; String orcidIdA = "0000-0003-2760-1191";
@Test @Test
// @Ignore
public void readPublicationFieldsTest() public void readPublicationFieldsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ...."); logger.info("running loadPublicationFieldsTest ....");
@ -95,8 +99,7 @@ public class OrcidNoDoiTest {
} }
@Test @Test
// @Ignore public void authorMatchTest() throws Exception {
private void authorMatchTest() throws Exception {
logger.info("running authorSimpleMatchTest ...."); logger.info("running authorSimpleMatchTest ....");
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
AuthorData author = new AuthorData(); AuthorData author = new AuthorData();
@ -121,9 +124,60 @@ public class OrcidNoDoiTest {
logger.error("parsing xml", e); logger.error("parsing xml", e);
} }
assertNotNull(workData); assertNotNull(workData);
Contributor a = workData.getContributors().get(0);
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
AuthorMatcher.match(author, workData.getContributors()); AuthorMatcher.match(author, workData.getContributors());
GsonBuilder builder = new GsonBuilder(); GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create(); Gson gson = builder.create();
logger.info(gson.toJson(workData)); logger.info(gson.toJson(workData));
assertTrue(workData.getContributors().size() == 6);
Contributor c = workData.getContributors().get(0);
assertTrue(c.getOid().equals("0000-0003-2760-1191"));
assertTrue(c.getName().equals("Khairy"));
assertTrue(c.getSurname().equals("Abdel Dayem"));
assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
}
@Test
public void readContributorsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml"));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData.getContributors());
assertTrue(workData.getContributors().size() == 5);
assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName()));
assertTrue(workData.getContributors().get(0).getSequence().equals("seq0"));
assertTrue(workData.getContributors().get(0).getRole().equals("role0"));
assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence()));
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole()));
assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2"));
assertTrue(workData.getContributors().get(2).getSequence().equals("seq2"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole()));
assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence()));
assertTrue(workData.getContributors().get(3).getRole().equals("role3"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName()));
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
} }
} }

View File

@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!-- ORCID work record (put-code 28776099) whose contributors section holds five entries with
     different combinations of credit name, sequence and role.
     NOTE(review): presumably the resource loaded by OrcidNoDoiTest.readContributorsTest; confirm
     the file name matches activity_work_0000-0003-2760-1191_contributors.xml. -->
<work:work xmlns:address="http://www.orcid.org/ns/address"
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
xmlns:employment="http://www.orcid.org/ns/employment"
xmlns:education="http://www.orcid.org/ns/education"
xmlns:other-name="http://www.orcid.org/ns/other-name"
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
xmlns:funding="http://www.orcid.org/ns/funding"
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
xmlns:service="http://www.orcid.org/ns/service"
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
xmlns:distinction="http://www.orcid.org/ns/distinction"
xmlns:internal="http://www.orcid.org/ns/internal"
xmlns:membership="http://www.orcid.org/ns/membership"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:activities="http://www.orcid.org/ns/activities"
xmlns:qualification="http://www.orcid.org/ns/qualification"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:error="http://www.orcid.org/ns/error"
xmlns:preferences="http://www.orcid.org/ns/preferences"
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
xmlns:work="http://www.orcid.org/ns/work"
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
path="/0000-0003-2760-1191/work/28776099" visibility="public">
<common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
<common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
<common:path>0000-0002-9157-3431</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Europe PubMed Central</common:source-name>
</common:source>
<work:title>
<common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
ST-Segment-Elevation Myocardial Infarction.</common:title>
</work:title>
<work:citation>
<work:citation-type>formatted-unspecified</work:citation-type>
<work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
</work:citation>
<work:type>journal-article</work:type>
<common:publication-date>
<common:year>2016</common:year>
<common:month>11</common:month>
</common:publication-date>
<common:external-ids>
<common:external-id>
<common:external-id-type>pmid</common:external-id-type>
<common:external-id-value>27899851</common:external-id-value>
<common:external-id-normalized transient="true">27899851</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>pmc</common:external-id-type>
<common:external-id-value>PMC5126442</common:external-id-value>
<common:external-id-normalized transient="true"
>PMC5126442</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
</common:external-ids>
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
<work:contributors>
<!-- Contributor 0: no credit-name element; sequence and role both set. -->
<work:contributor>
<work:contributor-attributes>
<work:contributor-sequence>seq0</work:contributor-sequence>
<work:contributor-role>role0</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<!-- Contributor 1: credit name only; no contributor-attributes element. -->
<work:contributor>
<work:credit-name>creditname1</work:credit-name>
</work:contributor>
<!-- Contributor 2: credit name and sequence set; role element present but empty. -->
<work:contributor>
<work:credit-name>creditname2</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>seq2</work:contributor-sequence>
<work:contributor-role></work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<!-- Contributor 3: credit name and role set; sequence element present but empty. -->
<work:contributor>
<work:credit-name>creditname3</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence></work:contributor-sequence>
<work:contributor-role>role3</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<!-- Contributor 4: credit-name element present but empty; sequence and role both set. -->
<work:contributor>
<work:credit-name></work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>seq4</work:contributor-sequence>
<work:contributor-role>role4</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
</work:contributors>
</work:work>

12
pom.xml
View File

@ -458,6 +458,18 @@
<version>${jsonschemagenerator.version}</version> <version>${jsonschemagenerator.version}</version>
</dependency> </dependency>
<!-- commons-text: version fixed in this dependencyManagement section through
     the common.text.version property. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>${common.text.version}</version>
</dependency>
<!-- httpclient: version fixed in this dependencyManagement section through
     the org.apache.httpcomponents.version property. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${org.apache.httpcomponents.version}</version>
</dependency>
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>