orcid-no-doi #43

Merged
claudio.atzori merged 45 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2020-12-02 10:55:12 +01:00
20 changed files with 320 additions and 228 deletions
Showing only changes of commit 6bc7dbeca7 - Show all commits

View File

@ -51,7 +51,6 @@
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>${org.apache.httpcomponents.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
@ -87,7 +86,6 @@
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId> <artifactId>commons-text</artifactId>
<version>${common.text.version}</version>
</dependency> </dependency>

Versions of dependencies should be only declared in the main pom file. Please declare this dependency there (v1.8) and refer to it without overriding the version.

Versions of dependencies should be only declared in the main pom file. Please declare this dependency there (v1.8) and refer to it without overriding the version.

Please remove the dependency version from here entirely. The version has to be declared only in the main pom file. Just like all the other dependencies in the same pom file.

Please remove the dependency version from here entirely. The version has to be declared *only* in the main pom file. Just like all the other dependencies in the same pom file.

I still see the dependency version declared here. Please move it into the project's main pom under the dependencyManagement section.

I still see the dependency version declared here. Please move it into the project's main pom under the _dependencyManagement_ section.

View File

@ -62,7 +62,7 @@ public class OrcidDSManager {
.toString( .toString(
OrcidDSManager.class OrcidDSManager.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
parser.parseArgument(args); parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri"); hdfsServerUri = parser.get("hdfsServerUri");

View File

@ -73,7 +73,7 @@ public class ActivitiesDumpReader {
SequenceFile.Writer.valueClass(Text.class))) { SequenceFile.Writer.valueClass(Text.class))) {
while ((entry = tais.getNextTarEntry()) != null) { while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName(); String filename = entry.getName();
StringBuffer buffer = new StringBuffer();
try { try {
if (entry.isDirectory() || !filename.contains("works")) { if (entry.isDirectory() || !filename.contains("works")) {
@ -83,7 +83,7 @@ public class ActivitiesDumpReader {
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
// tarInput // tarInput
String line; String line;
StringBuffer buffer = new StringBuffer(); buffer = new StringBuffer();
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
buffer.append(line); buffer.append(line);
} }

View File

@ -42,7 +42,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
.toString( .toString(
GenOrcidAuthorWork.class GenOrcidAuthorWork.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); "/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
parser.parseArgument(args); parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri"); hdfsServerUri = parser.get("hdfsServerUri");

View File

@ -67,7 +67,7 @@ public class SparkGenEnrichedOrcidWorks {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc JavaPairRDD<Text, Text> summariesRDD = sc
.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class); .sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark Dataset<AuthorData> summariesDataset = spark
.createDataset( .createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
@ -96,8 +96,8 @@ public class SparkGenEnrichedOrcidWorks {
Encoders.tuple(Encoders.STRING(), Encoders.STRING())) Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.toJavaRDD(); .toJavaRDD();
enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); // enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
logger.info("Works enriched data saved"); logger.info("Enriched works RDD ready.");
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
final LongAccumulator enrichedPublications = spark final LongAccumulator enrichedPublications = spark
@ -132,7 +132,7 @@ public class SparkGenEnrichedOrcidWorks {
.write() .write()
.format("parquet") .format("parquet")
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(workingPath + "no_doi_dataset/output"); .save(workingPath + outputEnrichedWorksPath);
logger.info("parsedPublications: " + parsedPublications.value().toString()); logger.info("parsedPublications: " + parsedPublications.value().toString());
logger.info("enrichedPublications: " + enrichedPublications.value().toString()); logger.info("enrichedPublications: " + enrichedPublications.value().toString());

View File

@ -5,6 +5,7 @@ import java.io.IOException;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -40,7 +41,7 @@ public class AuthorMatcher {
int matchCounter = 0; int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter); List<Integer> matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null; Contributor contributor = null;
contributors.forEach(c -> { contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
if (simpleMatch(c.getCreditName(), author.getName()) || if (simpleMatch(c.getCreditName(), author.getName()) ||
simpleMatch(c.getCreditName(), author.getSurname()) || simpleMatch(c.getCreditName(), author.getSurname()) ||
simpleMatch(c.getCreditName(), author.getOtherName())) { simpleMatch(c.getCreditName(), author.getOtherName())) {
@ -54,6 +55,7 @@ public class AuthorMatcher {
Optional<Contributor> optCon = contributors Optional<Contributor> optCon = contributors
.stream() .stream()
.filter(c -> c.isSimpleMatch()) .filter(c -> c.isSimpleMatch())
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.map(c -> { .map(c -> {
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
return c; return c;

View File

@ -183,39 +183,34 @@ public class XMLRecordParserNoDoi {
private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap) private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException { throws XPathParseException, NavException, XPathEvalException {
List<Contributor> contributors = new ArrayList<Contributor>(); List<Contributor> contributors = new ArrayList<Contributor>();
int nameIndex = 0; ap.selectXPath("//work:contributors/work:contributor");
ap.selectXPath("//work:contributor/work:credit-name");
while (ap.evalXPath() != -1) { while (ap.evalXPath() != -1) {
Contributor contributor = new Contributor(); Contributor contributor = new Contributor();
int t = vn.getText(); if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) {
if (t >= 0) { int val = vn.getText();
contributor.setCreditName(vn.toNormalizedString(t)); if (val != -1) {
contributors.add(nameIndex, contributor); contributor.setCreditName(vn.toNormalizedString(val));
nameIndex++; }
vn.toElement(VTDNav.PARENT);
} }
} if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
if (contributors.size() == 0) { if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) {
return contributors; int val = vn.getText();
} if (val != -1) {
contributor.setSequence(vn.toNormalizedString(val));
int sequenceIndex = 0; }
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence"); vn.toElement(VTDNav.PARENT);
while (ap.evalXPath() != -1) { }
int t = vn.getText(); if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) {
if (t >= 0) { int val = vn.getText();
contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t)); if (val != -1) {
sequenceIndex++; contributor.setRole(vn.toNormalizedString(val));
} }
} vn.toElement(VTDNav.PARENT);
}
int roleIndex = 0; vn.toElement(VTDNav.PARENT);
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
roleIndex++;
} }
contributors.add(contributor);
} }
return contributors; return contributors;
} }

View File

@ -0,0 +1,7 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
]

View File

@ -1,42 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://hadoop-edge2.garr-pa1.d4science.org:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-edge1.garr-pa1.d4science.org:18089/</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

View File

@ -1,67 +0,0 @@
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/summaries/output'/>
<mkdir path='${workingPath}/summaries/output'/>
</fs>
<ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_summaries">
<switch>
<case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
</case>
<default to="DownloadSummaries" />
</switch>
</decision>
<action name="DownloadSummaries">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="ImportOrcidSummaries"/>
<error to="Kill"/>
</action>
<action name="ImportOrcidSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
<arg>-o</arg><arg>summaries/output/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -9,7 +9,7 @@
</property> </property>
<property> <property>
<name>oozie.launcher.mapreduce.map.java.opts</name> <name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value> <value>-Xmx2g</value>
</property> </property>
<property> <property>
<name>jobTracker</name> <name>jobTracker</name>

View File

@ -1,4 +1,4 @@
<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Gen Orcid Works-no-doi From Activities" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>workingPath</name> <name>workingPath</name>
@ -6,70 +6,70 @@
</property> </property>
<property> <property>
<name>shell_cmd_0</name> <name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description> <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property> </property>
<property> <property>
<name>shell_cmd_1</name> <name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description> <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property> </property>
<property> <property>
<name>shell_cmd_2</name> <name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description> <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property> </property>
<property> <property>
<name>shell_cmd_3</name> <name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description> <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property> </property>
<property> <property>
<name>shell_cmd_4</name> <name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description> <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property> </property>
<property> <property>
<name>shell_cmd_5</name> <name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description> <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property> </property>
<property> <property>
<name>shell_cmd_6</name> <name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description> <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property> </property>
<property> <property>
<name>shell_cmd_7</name> <name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description> <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property> </property>
<property> <property>
<name>shell_cmd_8</name> <name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description> <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property> </property>
<property> <property>
<name>shell_cmd_9</name> <name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description> <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property> </property>
<property> <property>
<name>shell_cmd_X</name> <name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz <value>wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description> <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property> </property>
</parameters> </parameters>
<start to="ResetWorkingPath"/> <start to="ResetWorkingPath"/>
@ -82,11 +82,11 @@
<fs> <fs>
<delete path='${workingPath}/no_doi_works/*'/> <delete path='${workingPath}/no_doi_works/*'/>
</fs> </fs>
<ok to="fork_gen_orcid_author_work"/> <ok to="fork_check_download_files"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name = "fork_gen_orcid_author_work"> <fork name = "fork_check_download_files">
<path start = "check_exist_on_hdfs_activities_0"/> <path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/> <path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/> <path start = "check_exist_on_hdfs_activities_2"/>
@ -102,8 +102,8 @@
<decision name="check_exist_on_hdfs_activities_0"> <decision name="check_exist_on_hdfs_activities_0">
<switch> <switch>
<case to="GenOrcidAuthorWork_0"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))}
</case> </case>
<default to="Download_0" /> <default to="Download_0" />
</switch> </switch>
@ -118,7 +118,7 @@
<argument>${shell_cmd_0}</argument> <argument>${shell_cmd_0}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_0"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -129,7 +129,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -139,8 +139,8 @@
<decision name="check_exist_on_hdfs_activities_1"> <decision name="check_exist_on_hdfs_activities_1">
<switch> <switch>
<case to="GenOrcidAuthorWork_1"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))}
</case> </case>
<default to="Download_1" /> <default to="Download_1" />
</switch> </switch>
@ -155,7 +155,7 @@
<argument>${shell_cmd_1}</argument> <argument>${shell_cmd_1}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_1"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -166,7 +166,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -176,8 +176,8 @@
<decision name="check_exist_on_hdfs_activities_2"> <decision name="check_exist_on_hdfs_activities_2">
<switch> <switch>
<case to="GenOrcidAuthorWork_2"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))}
</case> </case>
<default to="Download_2" /> <default to="Download_2" />
</switch> </switch>
@ -192,7 +192,7 @@
<argument>${shell_cmd_2}</argument> <argument>${shell_cmd_2}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_2"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -203,7 +203,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -213,8 +213,8 @@
<decision name="check_exist_on_hdfs_activities_3"> <decision name="check_exist_on_hdfs_activities_3">
<switch> <switch>
<case to="GenOrcidAuthorWork_3"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))}
</case> </case>
<default to="Download_3" /> <default to="Download_3" />
</switch> </switch>
@ -229,7 +229,7 @@
<argument>${shell_cmd_3}</argument> <argument>${shell_cmd_3}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_3"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -240,7 +240,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -250,8 +250,8 @@
<decision name="check_exist_on_hdfs_activities_4"> <decision name="check_exist_on_hdfs_activities_4">
<switch> <switch>
<case to="GenOrcidAuthorWork_4"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))}
</case> </case>
<default to="Download_4" /> <default to="Download_4" />
</switch> </switch>
@ -266,7 +266,7 @@
<argument>${shell_cmd_4}</argument> <argument>${shell_cmd_4}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_4"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -277,7 +277,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -287,8 +287,8 @@
<decision name="check_exist_on_hdfs_activities_5"> <decision name="check_exist_on_hdfs_activities_5">
<switch> <switch>
<case to="GenOrcidAuthorWork_5"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))}
</case> </case>
<default to="Download_5" /> <default to="Download_5" />
</switch> </switch>
@ -303,7 +303,7 @@
<argument>${shell_cmd_5}</argument> <argument>${shell_cmd_5}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_5"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -314,7 +314,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -324,8 +324,8 @@
<decision name="check_exist_on_hdfs_activities_6"> <decision name="check_exist_on_hdfs_activities_6">
<switch> <switch>
<case to="GenOrcidAuthorWork_6"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))}
</case> </case>
<default to="Download_6" /> <default to="Download_6" />
</switch> </switch>
@ -340,7 +340,7 @@
<argument>${shell_cmd_6}</argument> <argument>${shell_cmd_6}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_6"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -351,7 +351,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -362,8 +362,8 @@
<decision name="check_exist_on_hdfs_activities_7"> <decision name="check_exist_on_hdfs_activities_7">
<switch> <switch>
<case to="GenOrcidAuthorWork_7"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))}
</case> </case>
<default to="Download_7" /> <default to="Download_7" />
</switch> </switch>
@ -378,7 +378,7 @@
<argument>${shell_cmd_7}</argument> <argument>${shell_cmd_7}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_7"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -389,7 +389,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -399,8 +399,8 @@
<decision name="check_exist_on_hdfs_activities_8"> <decision name="check_exist_on_hdfs_activities_8">
<switch> <switch>
<case to="GenOrcidAuthorWork_8"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))}
</case> </case>
<default to="Download_8" /> <default to="Download_8" />
</switch> </switch>
@ -415,7 +415,7 @@
<argument>${shell_cmd_8}</argument> <argument>${shell_cmd_8}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_8"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -426,7 +426,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -436,8 +436,8 @@
<decision name="check_exist_on_hdfs_activities_9"> <decision name="check_exist_on_hdfs_activities_9">
<switch> <switch>
<case to="GenOrcidAuthorWork_9"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))}
</case> </case>
<default to="Download_9" /> <default to="Download_9" />
</switch> </switch>
@ -452,7 +452,7 @@
<argument>${shell_cmd_9}</argument> <argument>${shell_cmd_9}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_9"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -463,7 +463,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -473,8 +473,8 @@
<decision name="check_exist_on_hdfs_activities_X"> <decision name="check_exist_on_hdfs_activities_X">
<switch> <switch>
<case to="GenOrcidAuthorWork_X"> <case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))}
</case> </case>
<default to="Download_X" /> <default to="Download_X" />
</switch> </switch>
@ -489,7 +489,7 @@
<argument>${shell_cmd_X}</argument> <argument>${shell_cmd_X}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="GenOrcidAuthorWork_X"/> <ok to="wait_download_phase_node"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -500,7 +500,7 @@
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class> <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg> <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg> <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java> </java>
@ -508,7 +508,35 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name = "wait_download_phase_node" to = "fork_gen_orcid_author_work"/>
<fork name = "fork_gen_orcid_author_work">
<path start = "GenOrcidAuthorWork_0"/>
<path start = "GenOrcidAuthorWork_1"/>
<path start = "GenOrcidAuthorWork_2"/>
<path start = "GenOrcidAuthorWork_3"/>
<path start = "GenOrcidAuthorWork_4"/>
<path start = "GenOrcidAuthorWork_5"/>
<path start = "GenOrcidAuthorWork_6"/>
<path start = "GenOrcidAuthorWork_7"/>
<path start = "GenOrcidAuthorWork_8"/>
<path start = "GenOrcidAuthorWork_9"/>
<path start = "GenOrcidAuthorWork_X"/>
</fork>
<join name = "join_node" to = "End"/> <join name = "join_node" to = "End"/>
<!-- <join name = "join_node" to = "fork_gen_orcid_author_work_2"/>-->
<!-- <fork name = "fork_gen_orcid_author_work_2">-->
<!-- <path start = "GenOrcidAuthorWork_6"/>-->
<!-- <path start = "GenOrcidAuthorWork_7"/>-->
<!-- <path start = "GenOrcidAuthorWork_8"/>-->
<!-- <path start = "GenOrcidAuthorWork_9"/>-->
<!-- <path start = "GenOrcidAuthorWork_X"/>-->
<!-- </fork>-->
<!-- <join name = "join_node_2" to = "End"/>-->
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -19,4 +19,8 @@
<name>oozie.launcher.mapreduce.user.classpath.first</name> <name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value> <value>true</value>
</property> </property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx16g</value>
</property>
</configuration> </configuration>

View File

@ -1,4 +1,4 @@
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Gen Orcid Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>workingPath</name> <name>workingPath</name>
@ -6,7 +6,7 @@
</property> </property>
<property> <property>
<name>shell_cmd_0</name> <name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz <value>wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz
</value> </value>
<description>the shell command that downloads and puts to hdfs orcid summaries</description> <description>the shell command that downloads and puts to hdfs orcid summaries</description>
</property> </property>
@ -21,8 +21,8 @@
<action name="ResetWorkingPath"> <action name="ResetWorkingPath">
<fs> <fs>
<delete path='${workingPath}/summaries/output'/> <delete path='${workingPath}/authors'/>
<mkdir path='${workingPath}/summaries/output'/> <mkdir path='${workingPath}/authors'/>
</fs> </fs>
<ok to="check_exist_on_hdfs_summaries"/> <ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/> <error to="Kill"/>
@ -31,7 +31,7 @@
<decision name="check_exist_on_hdfs_summaries"> <decision name="check_exist_on_hdfs_summaries">
<switch> <switch>
<case to="ImportOrcidSummaries"> <case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))}
</case> </case>
<default to="DownloadSummaries" /> <default to="DownloadSummaries" />
</switch> </switch>
@ -57,8 +57,8 @@
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class> <main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg> <arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
<arg>-o</arg><arg>summaries/output/</arg> <arg>-o</arg><arg>authors/</arg>
</java> </java>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -59,7 +59,7 @@
<action name="ResetWorkingPath"> <action name="ResetWorkingPath">
<fs> <fs>
<delete path='${workingPath}/no_doi_enriched_works/output'/> <delete path='${workingPath}/no_doi_dataset'/>
</fs> </fs>
<ok to="GenOrcidNoDoiDataset"/> <ok to="GenOrcidNoDoiDataset"/>
<error to="Kill"/> <error to="Kill"/>
@ -85,7 +85,7 @@
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg> <arg>-f</arg><arg>-</arg>
<arg>-ow</arg><arg>no_doi_works/</arg> <arg>-ow</arg><arg>no_doi_works/</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/output</arg> <arg>-oew</arg><arg>no_doi_dataset</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -38,8 +38,8 @@ public class OrcidClientTest {
@Test @Test
public void downloadTest() throws Exception { public void downloadTest() throws Exception {
String record = testDownloadRecord("0000-0002-2536-4498"); String record = testDownloadRecord("0000-0001-6163-2042");
File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml"); File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml");
OutputStream outStream = new FileOutputStream(f); OutputStream outStream = new FileOutputStream(f);
IOUtils.write(record.getBytes(), outStream); IOUtils.write(record.getBytes(), outStream);
System.out.println("saved to tmp"); System.out.println("saved to tmp");

View File

@ -2,15 +2,20 @@
package eu.dnetlib.doiboost.orcidnodoi.xml; package eu.dnetlib.doiboost.orcidnodoi.xml;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException; import java.io.IOException;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import javax.validation.constraints.AssertTrue;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaccardSimilarity; import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.mortbay.log.Log;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -41,7 +46,6 @@ public class OrcidNoDoiTest {
String orcidIdA = "0000-0003-2760-1191"; String orcidIdA = "0000-0003-2760-1191";
@Test @Test
// @Ignore
public void readPublicationFieldsTest() public void readPublicationFieldsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ...."); logger.info("running loadPublicationFieldsTest ....");
@ -95,8 +99,7 @@ public class OrcidNoDoiTest {
} }
@Test @Test
// @Ignore public void authorMatchTest() throws Exception {
private void authorMatchTest() throws Exception {
logger.info("running authorSimpleMatchTest ...."); logger.info("running authorSimpleMatchTest ....");
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
AuthorData author = new AuthorData(); AuthorData author = new AuthorData();
@ -121,9 +124,60 @@ public class OrcidNoDoiTest {
logger.error("parsing xml", e); logger.error("parsing xml", e);
} }
assertNotNull(workData); assertNotNull(workData);
Contributor a = workData.getContributors().get(0);
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
AuthorMatcher.match(author, workData.getContributors()); AuthorMatcher.match(author, workData.getContributors());
GsonBuilder builder = new GsonBuilder(); GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create(); Gson gson = builder.create();
logger.info(gson.toJson(workData)); logger.info(gson.toJson(workData));
assertTrue(workData.getContributors().size() == 6);
Contributor c = workData.getContributors().get(0);
assertTrue(c.getOid().equals("0000-0003-2760-1191"));
assertTrue(c.getName().equals("Khairy"));
assertTrue(c.getSurname().equals("Abdel Dayem"));
assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
}
@Test
public void readContributorsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml"));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData.getContributors());
assertTrue(workData.getContributors().size() == 5);
assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName()));
assertTrue(workData.getContributors().get(0).getSequence().equals("seq0"));
assertTrue(workData.getContributors().get(0).getRole().equals("role0"));
assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence()));
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole()));
assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2"));
assertTrue(workData.getContributors().get(2).getSequence().equals("seq2"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole()));
assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence()));
assertTrue(workData.getContributors().get(3).getRole().equals("role3"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName()));
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
} }
} }

View File

@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
xmlns:employment="http://www.orcid.org/ns/employment"
xmlns:education="http://www.orcid.org/ns/education"
xmlns:other-name="http://www.orcid.org/ns/other-name"
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
xmlns:funding="http://www.orcid.org/ns/funding"
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
xmlns:service="http://www.orcid.org/ns/service"
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
xmlns:distinction="http://www.orcid.org/ns/distinction"
xmlns:internal="http://www.orcid.org/ns/internal"
xmlns:membership="http://www.orcid.org/ns/membership"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:activities="http://www.orcid.org/ns/activities"
xmlns:qualification="http://www.orcid.org/ns/qualification"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:error="http://www.orcid.org/ns/error"
xmlns:preferences="http://www.orcid.org/ns/preferences"
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
xmlns:work="http://www.orcid.org/ns/work"
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
path="/0000-0003-2760-1191/work/28776099" visibility="public">
<common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
<common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
<common:path>0000-0002-9157-3431</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Europe PubMed Central</common:source-name>
</common:source>
<work:title>
<common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
ST-Segment-Elevation Myocardial Infarction.</common:title>
</work:title>
<work:citation>
<work:citation-type>formatted-unspecified</work:citation-type>
<work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
</work:citation>
<work:type>journal-article</work:type>
<common:publication-date>
<common:year>2016</common:year>
<common:month>11</common:month>
</common:publication-date>
<common:external-ids>
<common:external-id>
<common:external-id-type>pmid</common:external-id-type>
<common:external-id-value>27899851</common:external-id-value>
<common:external-id-normalized transient="true">27899851</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>pmc</common:external-id-type>
<common:external-id-value>PMC5126442</common:external-id-value>
<common:external-id-normalized transient="true"
>PMC5126442</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
</common:external-ids>
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
<work:contributors>
<work:contributor>
<work:contributor-attributes>
<work:contributor-sequence>seq0</work:contributor-sequence>
<work:contributor-role>role0</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>creditname1</work:credit-name>
</work:contributor>
<work:contributor>
<work:credit-name>creditname2</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>seq2</work:contributor-sequence>
<work:contributor-role></work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>creditname3</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence></work:contributor-sequence>
<work:contributor-role>role3</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name></work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>seq4</work:contributor-sequence>
<work:contributor-role>role4</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
</work:contributors>
</work:work>

12
pom.xml
View File

@ -458,6 +458,18 @@
<version>${jsonschemagenerator.version}</version> <version>${jsonschemagenerator.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>${common.text.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${org.apache.httpcomponents.version}</version>
</dependency>
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>