From 6bc7dbeca76e94f6cb00725aa50753d61d122952 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Fri, 6 Nov 2020 13:47:50 +0100 Subject: [PATCH] first version of dataset successful generated from orcid dump 2020 --- dhp-workflows/dhp-doiboost/pom.xml | 2 - .../doiboost/orcid/OrcidDSManager.java | 2 +- .../orcidnodoi/ActivitiesDumpReader.java | 4 +- .../orcidnodoi/GenOrcidAuthorWork.java | 2 +- .../SparkGenEnrichedOrcidWorks.java | 8 +- .../orcidnodoi/similarity/AuthorMatcher.java | 4 +- .../orcidnodoi/xml/XMLRecordParserNoDoi.java | 53 +++--- ... => gen_orcid_authors_from_summaries.json} | 0 ...en_orcid_works-no-doi_from_activities.json | 7 + .../orcid/oozie_app/config-default.xml | 42 ----- .../dhp/doiboost/orcid/oozie_app/workflow.xml | 67 -------- .../oozie_app/config-default.xml | 2 +- .../orcid_activities/oozie_app/workflow.xml | 156 +++++++++++------- .../oozie_app/config-default.xml | 4 + .../orcid_summaries/oozie_app/workflow.xml | 14 +- .../oozie_app/workflow.xml | 4 +- .../doiboost/orcid/OrcidClientTest.java | 4 +- .../orcidnodoi/xml/OrcidNoDoiTest.java | 60 ++++++- ..._work_0000-0003-2760-1191_contributors.xml | 101 ++++++++++++ pom.xml | 12 ++ 20 files changed, 320 insertions(+), 228 deletions(-) rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{create_orcid_authors_data.json => gen_orcid_authors_from_summaries.json} (100%) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{gen_enriched_orcid_works => orcidnodoi}/oozie_app/workflow.xml (95%) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index b81299cd17..624dd7b319 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -51,7 +51,6 @@ org.apache.httpcomponents httpclient - ${org.apache.httpcomponents.version} eu.dnetlib.dhp @@ -87,7 +86,6 @@ org.apache.commons commons-text - ${common.text.version} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index b62ad370e2..bf13db0219 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -62,7 +62,7 @@ public class OrcidDSManager { .toString( OrcidDSManager.class .getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); + "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json"))); parser.parseArgument(args); hdfsServerUri = parser.get("hdfsServerUri"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index c73e1efd14..c2cfafd874 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -73,7 +73,7 @@ public class ActivitiesDumpReader { SequenceFile.Writer.valueClass(Text.class))) { while ((entry = tais.getNextTarEntry()) != null) { String filename = entry.getName(); - + StringBuffer buffer = new StringBuffer(); try { if (entry.isDirectory() || !filename.contains("works")) { @@ -83,7 +83,7 @@ public class ActivitiesDumpReader { BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from // tarInput String line; - StringBuffer buffer = new StringBuffer(); + buffer = new StringBuffer(); while ((line = br.readLine()) != null) { buffer.append(line); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index d32e6d945a..d3e9aeaefc 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -42,7 +42,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager { .toString( GenOrcidAuthorWork.class .getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); + "/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json"))); parser.parseArgument(args); hdfsServerUri = parser.get("hdfsServerUri"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 24f0f7a87b..691ca3eee2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -67,7 +67,7 @@ public class SparkGenEnrichedOrcidWorks { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaPairRDD summariesRDD = sc - .sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class); + .sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class); Dataset summariesDataset = spark .createDataset( summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), @@ -96,8 +96,8 @@ public class SparkGenEnrichedOrcidWorks { Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .filter(Objects::nonNull) .toJavaRDD(); - enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); - logger.info("Works enriched data saved"); +// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); + logger.info("Enriched works RDD ready."); final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); final LongAccumulator enrichedPublications = spark @@ -132,7 +132,7 @@ public class SparkGenEnrichedOrcidWorks { .write() .format("parquet") .mode(SaveMode.Overwrite) - .save(workingPath + "no_doi_dataset/output"); + .save(workingPath + outputEnrichedWorksPath); logger.info("parsedPublications: " + parsedPublications.value().toString()); logger.info("enrichedPublications: " + enrichedPublications.value().toString()); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 88c84ee89a..6a1468f4c8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.text.Normalizer; import java.util.*; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,7 +41,7 @@ public class AuthorMatcher { int matchCounter = 0; List matchCounters = Arrays.asList(matchCounter); Contributor contributor = null; - contributors.forEach(c -> { + contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> { if (simpleMatch(c.getCreditName(), author.getName()) || simpleMatch(c.getCreditName(), author.getSurname()) || simpleMatch(c.getCreditName(), author.getOtherName())) { @@ -54,6 +55,7 @@ public class AuthorMatcher { Optional optCon = contributors .stream() .filter(c -> c.isSimpleMatch()) + .filter(c -> !StringUtils.isBlank(c.getCreditName())) .map(c -> { c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); return c; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java index c5c1155515..f4b0934020 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -183,39 +183,34 @@ public class XMLRecordParserNoDoi { private static List getContributors(VTDGen vg, VTDNav vn, AutoPilot ap) throws XPathParseException, NavException, XPathEvalException { List contributors = new ArrayList(); - int nameIndex = 0; - ap.selectXPath("//work:contributor/work:credit-name"); + ap.selectXPath("//work:contributors/work:contributor"); while (ap.evalXPath() != -1) { Contributor contributor = new Contributor(); - int t = vn.getText(); - if (t >= 0) { - contributor.setCreditName(vn.toNormalizedString(t)); - contributors.add(nameIndex, contributor); - nameIndex++; + if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) { + int val = vn.getText(); + if (val != -1) { + contributor.setCreditName(vn.toNormalizedString(val)); + } + vn.toElement(VTDNav.PARENT); } - } - if (contributors.size() == 0) { - return contributors; - } - - int sequenceIndex = 0; - ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence"); - while (ap.evalXPath() != -1) { - int t = vn.getText(); - if (t >= 0) { - contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t)); - sequenceIndex++; - } - } - - int roleIndex = 0; - ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role"); - while (ap.evalXPath() != -1) { - int t = vn.getText(); - if (t >= 0) { - contributors.get(roleIndex).setRole(vn.toNormalizedString(t)); - roleIndex++; + if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) { + if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) { + int val = vn.getText(); + if (val != -1) { + contributor.setSequence(vn.toNormalizedString(val)); + } + vn.toElement(VTDNav.PARENT); + } + if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) { + int val = vn.getText(); + if (val != -1) { + contributor.setRole(vn.toNormalizedString(val)); + } + vn.toElement(VTDNav.PARENT); + } + vn.toElement(VTDNav.PARENT); } + contributors.add(contributor); } return contributors; } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json new file mode 100644 index 0000000000..c3a8f92ecf --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json @@ -0,0 +1,7 @@ +[ + {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, + {"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}, + {"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml deleted file mode 100644 index fe14bb8cb5..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml +++ /dev/null @@ -1,42 +0,0 @@ - - - jobTracker - hadoop-rm3.garr-pa1.d4science.org:8032 - - - nameNode - hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - hive_metastore_uris - thrift://hadoop-edge2.garr-pa1.d4science.org:9083 - - - spark2YarnHistoryServerAddress - http://hadoop-edge1.garr-pa1.d4science.org:18089/ - - - spark2EventLogDir - /user/spark/spark2ApplicationHistory - - - spark2ExtraListeners - "com.cloudera.spark.lineage.NavigatorAppListener" - - - spark2SqlQueryExecutionListeners - "com.cloudera.spark.lineage.NavigatorQueryListener" - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml deleted file mode 100644 index 51e00dc0f8..0000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml +++ /dev/null @@ -1,67 +0,0 @@ - - - - workingPath - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz - - the shell command that downloads and puts to hdfs orcid summaries - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidDSManager - -w${workingPath}/ - -n${nameNode} - -fORCID_2019_summaries.tar.gz - -osummaries/output/ - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml index 3068562d06..05fe6d014f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml @@ -9,7 +9,7 @@ oozie.launcher.mapreduce.map.java.opts - -Xmx4g + -Xmx2g jobTracker diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml index 8f9a5123e6..ea4d33296c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + workingPath @@ -6,70 +6,70 @@ shell_cmd_0 - wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz + wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz the shell command that downloads and puts to hdfs orcid activity file 0 shell_cmd_1 - wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz + wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz the shell command that downloads and puts to hdfs orcid activity file 1 shell_cmd_2 - wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz + wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz the shell command that downloads and puts to hdfs orcid activity file 2 shell_cmd_3 - wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz + wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz the shell command that downloads and puts to hdfs orcid activity file 3 - + shell_cmd_4 - wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz + wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz the shell command that downloads and puts to hdfs orcid activity file 4 - + shell_cmd_5 - wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz + wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz the shell command that downloads and puts to hdfs orcid activity file 5 - + shell_cmd_6 - wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz + wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz the shell command that downloads and puts to hdfs orcid activity file 6 shell_cmd_7 - wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz + wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz the shell command that downloads and puts to hdfs orcid activity file 7 shell_cmd_8 - wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz + wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz the shell command that downloads and puts to hdfs orcid activity file 8 shell_cmd_9 - wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz + wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz the shell command that downloads and puts to hdfs orcid activity file 9 - + shell_cmd_X - wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz + wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz the shell command that downloads and puts to hdfs orcid activity file X - + @@ -82,11 +82,11 @@ - + - + @@ -102,8 +102,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))} @@ -118,7 +118,7 @@ ${shell_cmd_0} - + @@ -129,7 +129,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_0.tar.gz + -fORCID_2020_10_activites_0.tar.gz -owno_doi_works/works_0.seq -oewno_doi_enriched_works/ @@ -139,8 +139,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))} @@ -155,7 +155,7 @@ ${shell_cmd_1} - + @@ -166,7 +166,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_1.tar.gz + -fORCID_2020_10_activites_1.tar.gz -owno_doi_works/works_1.seq -oewno_doi_enriched_works/ @@ -176,8 +176,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))} @@ -192,7 +192,7 @@ ${shell_cmd_2} - + @@ -203,7 +203,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_2.tar.gz + -fORCID_2020_10_activites_2.tar.gz -owno_doi_works/works_2.seq -oewno_doi_enriched_works/ @@ -213,8 +213,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))} @@ -229,7 +229,7 @@ ${shell_cmd_3} - + @@ -240,7 +240,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_3.tar.gz + -fORCID_2020_10_activites_3.tar.gz -owno_doi_works/works_3.seq -oewno_doi_enriched_works/ @@ -250,8 +250,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))} @@ -266,7 +266,7 @@ ${shell_cmd_4} - + @@ -277,7 +277,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_4.tar.gz + -fORCID_2020_10_activites_4.tar.gz -owno_doi_works/works_4.seq -oewno_doi_enriched_works/ @@ -287,8 +287,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))} @@ -303,7 +303,7 @@ ${shell_cmd_5} - + @@ -314,7 +314,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_5.tar.gz + -fORCID_2020_10_activites_5.tar.gz -owno_doi_works/works_5.seq -oewno_doi_enriched_works/ @@ -324,8 +324,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))} @@ -340,7 +340,7 @@ ${shell_cmd_6} - + @@ -351,7 +351,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_6.tar.gz + -fORCID_2020_10_activites_6.tar.gz -owno_doi_works/works_6.seq -oewno_doi_enriched_works/ @@ -362,8 +362,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))} @@ -378,7 +378,7 @@ ${shell_cmd_7} - + @@ -389,7 +389,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_7.tar.gz + -fORCID_2020_10_activites_7.tar.gz -owno_doi_works/works_7.seq -oewno_doi_enriched_works/ @@ -399,8 +399,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))} @@ -415,7 +415,7 @@ ${shell_cmd_8} - + @@ -426,7 +426,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_8.tar.gz + -fORCID_2020_10_activites_8.tar.gz -owno_doi_works/works_8.seq -oewno_doi_enriched_works/ @@ -436,8 +436,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))} @@ -452,7 +452,7 @@ ${shell_cmd_9} - + @@ -463,7 +463,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_9.tar.gz + -fORCID_2020_10_activites_9.tar.gz -owno_doi_works/works_9.seq -oewno_doi_enriched_works/ @@ -473,8 +473,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))} @@ -489,7 +489,7 @@ ${shell_cmd_X} - + @@ -500,7 +500,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_X.tar.gz + -fORCID_2020_10_activites_X.tar.gz -owno_doi_works/works_X.seq -oewno_doi_enriched_works/ @@ -508,7 +508,35 @@ + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml index e77dd09c9d..e1829e8479 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml @@ -19,4 +19,8 @@ oozie.launcher.mapreduce.user.classpath.first true + + oozie.launcher.mapreduce.map.java.opts + -Xmx16g + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml index 3362cc67b7..8517f35ee0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + workingPath @@ -6,7 +6,7 @@ shell_cmd_0 - wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz + wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz the shell command that downloads and puts to hdfs orcid summaries @@ -21,8 +21,8 @@ - - + + @@ -31,7 +31,7 @@ - ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} + ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))} @@ -57,8 +57,8 @@ eu.dnetlib.doiboost.orcid.OrcidDSManager -w${workingPath}/ -n${nameNode} - -fORCID_2019_summaries.tar.gz - -osummaries/output/ + -fORCID_2020_10_summaries.tar.gz + -oauthors/ diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml similarity index 95% rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index faed3104a6..6cec48a6d3 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -59,7 +59,7 @@ - + @@ -85,7 +85,7 @@ -n${nameNode} -f- -owno_doi_works/ - -oewno_doi_enriched_works/output + -oewno_doi_dataset diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 5e0f91ecd9..774475626d 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -38,8 +38,8 @@ public class OrcidClientTest { @Test public void downloadTest() throws Exception { - String record = testDownloadRecord("0000-0002-2536-4498"); - File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml"); + String record = testDownloadRecord("0000-0001-6163-2042"); + File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml"); OutputStream outStream = new FileOutputStream(f); IOUtils.write(record.getBytes(), outStream); System.out.println("saved to tmp"); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index bf5aba99b0..fa2980ac4c 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -2,15 +2,20 @@ package eu.dnetlib.doiboost.orcidnodoi.xml; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.text.Normalizer; import java.util.*; +import javax.validation.constraints.AssertTrue; + import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.similarity.JaccardSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.junit.jupiter.api.Test; +import org.mortbay.log.Log; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +46,6 @@ public class OrcidNoDoiTest { String orcidIdA = "0000-0003-2760-1191"; @Test -// @Ignore public void readPublicationFieldsTest() throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { logger.info("running loadPublicationFieldsTest ...."); @@ -95,8 +99,7 @@ public class OrcidNoDoiTest { } @Test -// @Ignore - private void authorMatchTest() throws Exception { + public void authorMatchTest() throws Exception { logger.info("running authorSimpleMatchTest ...."); String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; AuthorData author = new AuthorData(); @@ -121,9 +124,60 @@ public class OrcidNoDoiTest { logger.error("parsing xml", e); } assertNotNull(workData); + + Contributor a = workData.getContributors().get(0); + assertTrue(a.getCreditName().equals("Abdel-Dayem K")); + AuthorMatcher.match(author, workData.getContributors()); GsonBuilder builder = new GsonBuilder(); Gson gson = builder.create(); logger.info(gson.toJson(workData)); + + assertTrue(workData.getContributors().size() == 6); + Contributor c = workData.getContributors().get(0); + assertTrue(c.getOid().equals("0000-0003-2760-1191")); + assertTrue(c.getName().equals("Khairy")); + assertTrue(c.getSurname().equals("Abdel Dayem")); + assertTrue(c.getCreditName().equals("Abdel-Dayem K")); + } + + @Test + public void readContributorsTest() + throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + logger.info("running loadPublicationFieldsTest ...."); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml")); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData.getContributors()); + assertTrue(workData.getContributors().size() == 5); + assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName())); + assertTrue(workData.getContributors().get(0).getSequence().equals("seq0")); + assertTrue(workData.getContributors().get(0).getRole().equals("role0")); + assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence())); + assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole())); + assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2")); + assertTrue(workData.getContributors().get(2).getSequence().equals("seq2")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole())); + assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence())); + assertTrue(workData.getContributors().get(3).getRole().equals("role3")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName())); + assertTrue(workData.getContributors().get(4).getSequence().equals("seq4")); + assertTrue(workData.getContributors().get(4).getRole().equals("role4")); } } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml new file mode 100644 index 0000000000..26e64aedae --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml @@ -0,0 +1,101 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + + seq0 + role0 + + + + creditname1 + + + creditname2 + + seq2 + + + + + creditname3 + + + role3 + + + + + + seq4 + role4 + + + + diff --git a/pom.xml b/pom.xml index d64de01ac2..3629e2f1bd 100644 --- a/pom.xml +++ b/pom.xml @@ -458,6 +458,18 @@ ${jsonschemagenerator.version} + + org.apache.commons + commons-text + ${common.text.version} + + + + org.apache.httpcomponents + httpclient + ${org.apache.httpcomponents.version} + +