diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml
index b81299cd1..624dd7b31 100644
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@@ -51,7 +51,6 @@
org.apache.httpcomponents
httpclient
- ${org.apache.httpcomponents.version}
eu.dnetlib.dhp
@@ -87,7 +86,6 @@
org.apache.commons
commons-text
- ${common.text.version}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
index b62ad370e..bf13db021 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@@ -62,7 +62,7 @@ public class OrcidDSManager {
.toString(
OrcidDSManager.class
.getResourceAsStream(
- "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
+ "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index c73e1efd1..c2cfafd87 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -73,7 +73,7 @@ public class ActivitiesDumpReader {
SequenceFile.Writer.valueClass(Text.class))) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
-
+ StringBuffer buffer = new StringBuffer();
try {
if (entry.isDirectory() || !filename.contains("works")) {
@@ -83,7 +83,7 @@ public class ActivitiesDumpReader {
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
// tarInput
String line;
- StringBuffer buffer = new StringBuffer();
+ buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
index d32e6d945..d3e9aeaef 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -42,7 +42,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
.toString(
GenOrcidAuthorWork.class
.getResourceAsStream(
- "/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
+ "/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 24f0f7a87..691ca3eee 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -67,7 +67,7 @@ public class SparkGenEnrichedOrcidWorks {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD summariesRDD = sc
- .sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
+ .sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
Dataset summariesDataset = spark
.createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
@@ -96,8 +96,8 @@ public class SparkGenEnrichedOrcidWorks {
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.filter(Objects::nonNull)
.toJavaRDD();
- enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
- logger.info("Works enriched data saved");
+ // the enriched works are no longer dumped as text here; a parquet dataset is written to workingPath + outputEnrichedWorksPath further down
+ logger.info("Enriched works RDD ready.");
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
final LongAccumulator enrichedPublications = spark
@@ -132,7 +132,7 @@ public class SparkGenEnrichedOrcidWorks {
.write()
.format("parquet")
.mode(SaveMode.Overwrite)
- .save(workingPath + "no_doi_dataset/output");
+ .save(workingPath + outputEnrichedWorksPath);
logger.info("parsedPublications: " + parsedPublications.value().toString());
logger.info("enrichedPublications: " + enrichedPublications.value().toString());
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
index 88c84ee89..6a1468f4c 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -5,6 +5,7 @@ import java.io.IOException;
import java.text.Normalizer;
import java.util.*;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -40,7 +41,7 @@ public class AuthorMatcher {
int matchCounter = 0;
List matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null;
- contributors.forEach(c -> {
+ contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
if (simpleMatch(c.getCreditName(), author.getName()) ||
simpleMatch(c.getCreditName(), author.getSurname()) ||
simpleMatch(c.getCreditName(), author.getOtherName())) {
@@ -54,6 +55,7 @@ public class AuthorMatcher {
Optional optCon = contributors
.stream()
.filter(c -> c.isSimpleMatch())
+ .filter(c -> !StringUtils.isBlank(c.getCreditName()))
.map(c -> {
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
return c;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
index c5c115551..f4b093402 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
@@ -183,39 +183,34 @@ public class XMLRecordParserNoDoi {
private static List getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException {
List contributors = new ArrayList();
- int nameIndex = 0;
- ap.selectXPath("//work:contributor/work:credit-name");
+ ap.selectXPath("//work:contributors/work:contributor");
while (ap.evalXPath() != -1) {
Contributor contributor = new Contributor();
- int t = vn.getText();
- if (t >= 0) {
- contributor.setCreditName(vn.toNormalizedString(t));
- contributors.add(nameIndex, contributor);
- nameIndex++;
+ if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) {
+ int val = vn.getText();
+ if (val != -1) {
+ contributor.setCreditName(vn.toNormalizedString(val));
+ }
+ vn.toElement(VTDNav.PARENT);
}
- }
- if (contributors.size() == 0) {
- return contributors;
- }
-
- int sequenceIndex = 0;
- ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence");
- while (ap.evalXPath() != -1) {
- int t = vn.getText();
- if (t >= 0) {
- contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t));
- sequenceIndex++;
- }
- }
-
- int roleIndex = 0;
- ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
- while (ap.evalXPath() != -1) {
- int t = vn.getText();
- if (t >= 0) {
- contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
- roleIndex++;
+ if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
+ if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) {
+ int val = vn.getText();
+ if (val != -1) {
+ contributor.setSequence(vn.toNormalizedString(val));
+ }
+ vn.toElement(VTDNav.PARENT);
+ }
+ if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) {
+ int val = vn.getText();
+ if (val != -1) {
+ contributor.setRole(vn.toNormalizedString(val));
+ }
+ vn.toElement(VTDNav.PARENT);
+ }
+ vn.toElement(VTDNav.PARENT);
}
+ contributors.add(contributor);
}
return contributors;
}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json
similarity index 100%
rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json
rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json
new file mode 100644
index 000000000..c3a8f92ec
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json
@@ -0,0 +1,7 @@
+[
+ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
+ {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
+ {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
+ {"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
+ {"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
deleted file mode 100644
index fe14bb8cb..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-
-
- jobTracker
- hadoop-rm3.garr-pa1.d4science.org:8032
-
-
- nameNode
- hdfs://hadoop-rm1.garr-pa1.d4science.org:8020
-
-
- oozie.use.system.libpath
- true
-
-
- oozie.action.sharelib.for.spark
- spark2
-
-
- oozie.launcher.mapreduce.user.classpath.first
- true
-
-
- hive_metastore_uris
- thrift://hadoop-edge2.garr-pa1.d4science.org:9083
-
-
- spark2YarnHistoryServerAddress
- http://hadoop-edge1.garr-pa1.d4science.org:18089/
-
-
- spark2EventLogDir
- /user/spark/spark2ApplicationHistory
-
-
- spark2ExtraListeners
- "com.cloudera.spark.lineage.NavigatorAppListener"
-
-
- spark2SqlQueryExecutionListeners
- "com.cloudera.spark.lineage.NavigatorQueryListener"
-
-
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
deleted file mode 100644
index 51e00dc0f..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-
-
-
- workingPath
- the working dir base path
-
-
- shell_cmd_0
- wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
-
- the shell command that downloads and puts to hdfs orcid summaries
-
-
-
-
-
-
-
- Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
-
-
-
-
-
-
-
- ${jobTracker}
- ${nameNode}
- bash
- -c
- ${shell_cmd_0}
-
-
-
-
-
-
-
-
- ${jobTracker}
- ${nameNode}
- eu.dnetlib.doiboost.orcid.OrcidDSManager
- -w${workingPath}/
- -n${nameNode}
- -fORCID_2019_summaries.tar.gz
- -osummaries/output/
-
-
-
-
-
-
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
index 3068562d0..05fe6d014 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
@@ -9,7 +9,7 @@
oozie.launcher.mapreduce.map.java.opts
- -Xmx4g
+ -Xmx2g
jobTracker
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
index 8f9a5123e..ea4d33296 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-
+
workingPath
@@ -6,70 +6,70 @@
shell_cmd_0
- wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 0
shell_cmd_1
- wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 1
shell_cmd_2
- wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 2
shell_cmd_3
- wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 3
-
+
shell_cmd_4
- wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 4
-
+
shell_cmd_5
- wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 5
-
+
shell_cmd_6
- wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 6
shell_cmd_7
- wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 7
shell_cmd_8
- wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 8
shell_cmd_9
- wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz
the shell command that downloads and puts to hdfs orcid activity file 9
-
+
shell_cmd_X
- wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
+ wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz
the shell command that downloads and puts to hdfs orcid activity file X
-
+
@@ -82,11 +82,11 @@
-
+
-
+
@@ -102,8 +102,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))}
@@ -118,7 +118,7 @@
${shell_cmd_0}
-
+
@@ -129,7 +129,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_0.tar.gz
+ -fORCID_2020_10_activites_0.tar.gz
-owno_doi_works/works_0.seq
-oewno_doi_enriched_works/
@@ -139,8 +139,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))}
@@ -155,7 +155,7 @@
${shell_cmd_1}
-
+
@@ -166,7 +166,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_1.tar.gz
+ -fORCID_2020_10_activites_1.tar.gz
-owno_doi_works/works_1.seq
-oewno_doi_enriched_works/
@@ -176,8 +176,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))}
@@ -192,7 +192,7 @@
${shell_cmd_2}
-
+
@@ -203,7 +203,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_2.tar.gz
+ -fORCID_2020_10_activites_2.tar.gz
-owno_doi_works/works_2.seq
-oewno_doi_enriched_works/
@@ -213,8 +213,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))}
@@ -229,7 +229,7 @@
${shell_cmd_3}
-
+
@@ -240,7 +240,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_3.tar.gz
+ -fORCID_2020_10_activites_3.tar.gz
-owno_doi_works/works_3.seq
-oewno_doi_enriched_works/
@@ -250,8 +250,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))}
@@ -266,7 +266,7 @@
${shell_cmd_4}
-
+
@@ -277,7 +277,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_4.tar.gz
+ -fORCID_2020_10_activites_4.tar.gz
-owno_doi_works/works_4.seq
-oewno_doi_enriched_works/
@@ -287,8 +287,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))}
@@ -303,7 +303,7 @@
${shell_cmd_5}
-
+
@@ -314,7 +314,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_5.tar.gz
+ -fORCID_2020_10_activites_5.tar.gz
-owno_doi_works/works_5.seq
-oewno_doi_enriched_works/
@@ -324,8 +324,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))}
@@ -340,7 +340,7 @@
${shell_cmd_6}
-
+
@@ -351,7 +351,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_6.tar.gz
+ -fORCID_2020_10_activites_6.tar.gz
-owno_doi_works/works_6.seq
-oewno_doi_enriched_works/
@@ -362,8 +362,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))}
@@ -378,7 +378,7 @@
${shell_cmd_7}
-
+
@@ -389,7 +389,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_7.tar.gz
+ -fORCID_2020_10_activites_7.tar.gz
-owno_doi_works/works_7.seq
-oewno_doi_enriched_works/
@@ -399,8 +399,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))}
@@ -415,7 +415,7 @@
${shell_cmd_8}
-
+
@@ -426,7 +426,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_8.tar.gz
+ -fORCID_2020_10_activites_8.tar.gz
-owno_doi_works/works_8.seq
-oewno_doi_enriched_works/
@@ -436,8 +436,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))}
@@ -452,7 +452,7 @@
${shell_cmd_9}
-
+
@@ -463,7 +463,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_9.tar.gz
+ -fORCID_2020_10_activites_9.tar.gz
-owno_doi_works/works_9.seq
-oewno_doi_enriched_works/
@@ -473,8 +473,8 @@
-
- ${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
+
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))}
@@ -489,7 +489,7 @@
${shell_cmd_X}
-
+
@@ -500,7 +500,7 @@
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_activites_X.tar.gz
+ -fORCID_2020_10_activites_X.tar.gz
-owno_doi_works/works_X.seq
-oewno_doi_enriched_works/
@@ -508,7 +508,35 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
index e77dd09c9..e1829e847 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
@@ -19,4 +19,8 @@
oozie.launcher.mapreduce.user.classpath.first
true
+
+ oozie.launcher.mapreduce.map.java.opts
+ -Xmx16g
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
index 3362cc67b..8517f35ee 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-
+
workingPath
@@ -6,7 +6,7 @@
shell_cmd_0
- wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
+ wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz
the shell command that downloads and puts to hdfs orcid summaries
@@ -21,8 +21,8 @@
-
-
+
+
@@ -31,7 +31,7 @@
- ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
+ ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))}
@@ -57,8 +57,8 @@
eu.dnetlib.doiboost.orcid.OrcidDSManager
-w${workingPath}/
-n${nameNode}
- -fORCID_2019_summaries.tar.gz
- -osummaries/output/
+ -fORCID_2020_10_summaries.tar.gz
+ -oauthors/
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml
similarity index 95%
rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml
index faed3104a..6cec48a6d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml
@@ -59,7 +59,7 @@
-
+
@@ -85,7 +85,7 @@
-n${nameNode}
-f-
-owno_doi_works/
- -oewno_doi_enriched_works/output
+ -oewno_doi_dataset
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
index 5e0f91ecd..774475626 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
@@ -38,8 +38,8 @@ public class OrcidClientTest {
@Test
public void downloadTest() throws Exception {
- String record = testDownloadRecord("0000-0002-2536-4498");
- File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
+ String record = testDownloadRecord("0000-0001-6163-2042");
+ File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml");
OutputStream outStream = new FileOutputStream(f);
IOUtils.write(record.getBytes(), outStream);
System.out.println("saved to tmp");
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index bf5aba99b..fa2980ac4 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -2,15 +2,20 @@
package eu.dnetlib.doiboost.orcidnodoi.xml;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.text.Normalizer;
import java.util.*;
+import javax.validation.constraints.AssertTrue;
+
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.junit.jupiter.api.Test;
+import org.mortbay.log.Log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -41,7 +46,6 @@ public class OrcidNoDoiTest {
String orcidIdA = "0000-0003-2760-1191";
@Test
-// @Ignore
public void readPublicationFieldsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
@@ -95,8 +99,7 @@ public class OrcidNoDoiTest {
}
@Test
-// @Ignore
- private void authorMatchTest() throws Exception {
+ public void authorMatchTest() throws Exception {
logger.info("running authorSimpleMatchTest ....");
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
AuthorData author = new AuthorData();
@@ -121,9 +124,60 @@ public class OrcidNoDoiTest {
logger.error("parsing xml", e);
}
assertNotNull(workData);
+
+ Contributor a = workData.getContributors().get(0);
+ assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
+
AuthorMatcher.match(author, workData.getContributors());
GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create();
logger.info(gson.toJson(workData));
+
+ assertTrue(workData.getContributors().size() == 6);
+ Contributor c = workData.getContributors().get(0);
+ assertTrue(c.getOid().equals("0000-0003-2760-1191"));
+ assertTrue(c.getName().equals("Khairy"));
+ assertTrue(c.getSurname().equals("Abdel Dayem"));
+ assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
+ }
+
+ /**
+ * The fixture holds five contributors, each of them lacking a credit name, a sequence and/or a role:
+ * each parsed value must stay attached to the contributor it was read from, absent ones must stay blank.
+ */
+ @Test
+ public void readContributorsTest()
+ throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+ logger.info("running readContributorsTest ....");
+ String xml = IOUtils
+ .toString(
+ OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml"));
+
+ XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
+ WorkDataNoDoi workData = null;
+ try {
+ workData = p.VTDParseWorkData(xml.getBytes());
+ } catch (Exception e) {
+ logger.error("parsing xml", e);
+ }
+ // a parsing failure is only logged above, so check the result before dereferencing it
+ assertNotNull(workData);
+ assertNotNull(workData.getContributors());
+ assertTrue(workData.getContributors().size() == 5);
+ assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName()));
+ assertTrue(workData.getContributors().get(0).getSequence().equals("seq0"));
+ assertTrue(workData.getContributors().get(0).getRole().equals("role0"));
+ assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1"));
+ assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence()));
+ assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole()));
+ assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2"));
+ assertTrue(workData.getContributors().get(2).getSequence().equals("seq2"));
+ assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole()));
+ assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3"));
+ assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence()));
+ assertTrue(workData.getContributors().get(3).getRole().equals("role3"));
+ assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName()));
+ assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
+ assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
}
}
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml
new file mode 100644
index 000000000..26e64aeda
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml
@@ -0,0 +1,101 @@
+
+
+ 2016-12-12T23:02:05.233Z
+ 2016-12-13T09:08:16.412Z
+
+
+ https://orcid.org/0000-0002-9157-3431
+ 0000-0002-9157-3431
+ orcid.org
+
+ Europe PubMed Central
+
+
+ Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+ Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+ ST-Segment-Elevation Myocardial Infarction.
+
+
+ formatted-unspecified
+ Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+ Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016
+
+ journal-article
+
+ 2016
+ 11
+
+
+
+ pmid
+ 27899851
+ 27899851
+ self
+
+
+ pmc
+ PMC5126442
+ PMC5126442
+ self
+
+
+ http://europepmc.org/abstract/med/27899851
+
+
+
+ seq0
+ role0
+
+
+
+ creditname1
+
+
+ creditname2
+
+ seq2
+
+
+
+
+ creditname3
+
+
+ role3
+
+
+
+
+
+ seq4
+ role4
+
+
+
+
diff --git a/pom.xml b/pom.xml
index d64de01ac..3629e2f1b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -458,6 +458,18 @@
${jsonschemagenerator.version}
+
+ org.apache.commons
+ commons-text
+ ${common.text.version}
+
+
+
+ org.apache.httpcomponents
+ httpclient
+ ${org.apache.httpcomponents.version}
+
+