From fcbb4c148929c06756756f18a14cece1a5c232b3 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 24 Jun 2020 16:29:32 +0200 Subject: [PATCH 01/34] parser of orcid publication data from xml original dump --- dhp-workflows/dhp-doiboost/pom.xml | 6 + .../doiboost/orcid/model/AuthorData.java | 9 + .../orcidnodoi/model/Contributor.java | 54 +++ .../doiboost/orcidnodoi/model/ExternalId.java | 32 ++ .../orcidnodoi/model/PublicationDate.java | 32 ++ .../orcidnodoi/model/WorkDataNoDoi.java | 101 ++++++ .../orcidnodoi/xml/XMLRecordParserNoDoi.java | 216 ++++++++++++ .../orcid/xml/XMLRecordParserTest.java | 2 +- .../orcidnodoi/xml/OrcidNoDoiTest.java | 326 ++++++++++++++++++ .../xml/activity_work_0000-0003-2760-1191.xml | 106 ++++++ .../xml/activity_work_0000-0002-5982-8983.xml | 0 ...ty_work_0000-0003-2760-1191-similarity.xml | 113 ++++++ .../xml/activity_work_0000-0003-2760-1191.xml | 106 ++++++ 13 files changed, 1102 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml rename dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/{orcid => orcidnodoi}/xml/activity_work_0000-0002-5982-8983.xml (100%) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 39bb81ec1..2662d0a39 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -84,6 +84,12 @@ spark-sql_2.11 + + org.apache.commons + commons-text + 1.8 + + diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java index 29551c347..87f1f65c8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java @@ -9,6 +9,7 @@ public class AuthorData implements Serializable { private String name; private String surname; private String creditName; + private String otherName; private String errorCode; public String getErrorCode() { @@ -50,4 +51,12 @@ public class AuthorData implements Serializable { public void setOid(String oid) { this.oid = oid; } + + public String getOtherName() { + return otherName; + } + + public void setOtherName(String otherName) { + this.otherName = otherName; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java new file mode 100644 index 000000000..42076de5d --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java @@ -0,0 +1,54 @@ + +package eu.dnetlib.doiboost.orcidnodoi.model; + +import java.io.Serializable; + +import eu.dnetlib.doiboost.orcid.model.AuthorData; + +public class Contributor extends AuthorData implements Serializable { + private String sequence; + private String role; + private boolean simpleMatch = false; + private Double score = 0.0; + private boolean bestMatch = false; + + public String getSequence() { + return sequence; + } + + public void setSequence(String sequence) { + this.sequence = sequence; + } + + public String getRole() { + return role; + } + + public void setRole(String role) { + this.role = role; + } + + public boolean isSimpleMatch() { + return simpleMatch; + } + + public void setSimpleMatch(boolean simpleMatch) { + this.simpleMatch = simpleMatch; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + public boolean isBestMatch() { + return bestMatch; + } + + public void setBestMatch(boolean bestMatch) { + this.bestMatch = bestMatch; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java new file mode 100644 index 000000000..865e54ae3 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java @@ -0,0 +1,32 @@ + +package eu.dnetlib.doiboost.orcidnodoi.model; + +public class ExternalId { + private String type; + private String value; + private String relationShip; + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + public String getRelationShip() { + return relationShip; + } + + public void setRelationShip(String relationShip) { + this.relationShip = relationShip; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java new file mode 100644 index 000000000..9282a80ba --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java @@ -0,0 +1,32 @@ + +package eu.dnetlib.doiboost.orcidnodoi.model; + +public class PublicationDate { + private String year; + private String month; + private String day; + + public String getYear() { + return year; + } + + public void setYear(String year) { + this.year = year; + } + + public String getMonth() { + return month; + } + + public void setMonth(String month) { + this.month = month; + } + + public String getDay() { + return day; + } + + public void setDay(String day) { + this.day = day; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java new file mode 100644 index 000000000..ee13454e1 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java @@ -0,0 +1,101 @@ + +package 
eu.dnetlib.doiboost.orcidnodoi.model; + +import java.io.Serializable; +import java.util.List; + +public class WorkDataNoDoi implements Serializable { + + private String oid; + private String id; + private String sourceName; + private String type; + private List titles; + private List urls; + List extIds; + List publicationDates; + List contributors; + + public String getOid() { + return oid; + } + + public void setOid(String oid) { + this.oid = oid; + } + + public String getErrorCode() { + return errorCode; + } + + public void setErrorCode(String errorCode) { + this.errorCode = errorCode; + } + + private String errorCode; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getTitles() { + return titles; + } + + public void setTitles(List titles) { + this.titles = titles; + } + + public String getSourceName() { + return sourceName; + } + + public void setSourceName(String sourceName) { + this.sourceName = sourceName; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public List getUrls() { + return urls; + } + + public void setUrls(List urls) { + this.urls = urls; + } + + public List getExtIds() { + return extIds; + } + + public void setExtIds(List extIds) { + this.extIds = extIds; + } + + public List getPublicationDates() { + return publicationDates; + } + + public void setPublicationDates(List publicationDates) { + this.publicationDates = publicationDates; + } + + public List getContributors() { + return contributors; + } + + public void setContributors(List contributors) { + this.contributors = contributors; + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java new file mode 100644 index 000000000..6e5771547 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -0,0 +1,216 @@ + +package eu.dnetlib.doiboost.orcidnodoi.xml; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.ximpleware.*; + +import eu.dnetlib.dhp.parser.utility.VtdException; +import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; +import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; +import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId; +import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; + +public class XMLRecordParserNoDoi { + + private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class); + + private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common"; + private static final String NS_COMMON = "common"; + private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person"; + private static final String NS_PERSON = "person"; + private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details"; + private static final String NS_DETAILS = "personal-details"; + private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name"; + private static final String NS_OTHER = "other-name"; + private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record"; + private static final String NS_RECORD = "record"; + private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error"; + + 
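Note: the model classes above (Contributor extending AuthorData, ExternalId, PublicationDate, WorkDataNoDoi) are plain beans that carry one parsed work record. A minimal sketch of how they compose; the values are illustrative and Gson (already used by the project's JSON helpers) is only used here for printing:

import java.util.Arrays;

import com.google.gson.Gson;

import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;

public class WorkDataNoDoiExample {

    public static void main(String[] args) {
        WorkDataNoDoi work = new WorkDataNoDoi();
        work.setOid("0000-0003-2760-1191");
        work.setId("12345678"); // illustrative put-code
        work.setType("journal-article");
        work.setTitles(Arrays.asList("Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide ..."));

        ExternalId pmid = new ExternalId();
        pmid.setType("pmid");
        pmid.setValue("27899851");
        pmid.setRelationShip("self");
        work.setExtIds(Arrays.asList(pmid));

        Contributor first = new Contributor();
        first.setCreditName("Abdel-Dayem K");
        first.setSequence("first");
        first.setRole("author");
        work.setContributors(Arrays.asList(first));

        // print the bean as JSON, the same representation used later by the JSON helpers
        System.out.println(new Gson().toJson(work));
    }
}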
private static final String NS_WORK = "work"; + private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; + + private static final String NS_ERROR = "error"; + + public static WorkDataNoDoi VTDParseWorkData(byte[] bytes) + throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException, + NavException, XPathEvalException { + logger.info("parsing xml ..."); + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); + ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL); + ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); + + WorkDataNoDoi workData = new WorkDataNoDoi(); + final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); + if (!errors.isEmpty()) { + workData.setErrorCode(errors.get(0)); + return workData; + } + + List workNodes = VtdUtilityParser + .getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "put-code")); + if (!workNodes.isEmpty()) { + final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1]; + workData.setOid(oid); + final String id = (workNodes.get(0).getAttributes().get("put-code")); + workData.setId(id); + } else { + return null; + } + + final List titles = VtdUtilityParser + .getTextValue( + ap, vn, "//common:title"); + if (!titles.isEmpty()) { + workData.setTitles(titles); + } + + final List sourceNames = VtdUtilityParser + .getTextValue( + ap, vn, "//common:source-name"); + if (!sourceNames.isEmpty()) { + workData.setSourceName(sourceNames.get(0)); + } + + final List types = VtdUtilityParser + .getTextValue( + ap, vn, "//work:type"); + if (!types.isEmpty()) { + workData.setType(types.get(0)); + } + + final List urls = VtdUtilityParser + .getTextValue( + ap, vn, "//common:url"); + if (!urls.isEmpty()) { + workData.setUrls(urls); + } + + workData.setPublicationDates(getPublicationDates(vg, vn, ap)); + workData.setExtIds(getExternalIds(vg, vn, ap)); + workData.setContributors(getContributors(vg, vn, ap)); + return workData; + + } + + private static List getPublicationDates(VTDGen vg, VTDNav vn, AutoPilot ap) + throws XPathParseException, NavException, XPathEvalException { + List publicationDates = new ArrayList(); + int yearIndex = 0; + ap.selectXPath("//common:publication-date/common:year"); + while (ap.evalXPath() != -1) { + PublicationDate publicationDate = new PublicationDate(); + int t = vn.getText(); + if (t >= 0) { + publicationDate.setYear(vn.toNormalizedString(t)); + publicationDates.add(yearIndex, publicationDate); + yearIndex++; + } + } + int monthIndex = 0; + ap.selectXPath("//common:publication-date/common:month"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + publicationDates.get(monthIndex).setMonth(vn.toNormalizedString(t)); + monthIndex++; + } + } + int dayIndex = 0; + ap.selectXPath("//common:publication-date/common:day"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + publicationDates.get(dayIndex).setDay(vn.toNormalizedString(t)); + dayIndex++; + } + } + return publicationDates; + } + + private static List getExternalIds(VTDGen vg, VTDNav vn, AutoPilot ap) + throws XPathParseException, NavException, XPathEvalException { + List extIds = new ArrayList(); + int typeIndex = 0; + ap.selectXPath("//common:external-id/common:external-id-type"); + while (ap.evalXPath() != -1) { + ExternalId extId = new ExternalId(); + int t = vn.getText(); + 
if (t >= 0) { + extId.setType(vn.toNormalizedString(t)); + extIds.add(typeIndex, extId); + typeIndex++; + } + } + int valueIndex = 0; + ap.selectXPath("//common:external-id/common:external-id-value"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + extIds.get(valueIndex).setValue(vn.toNormalizedString(t)); + valueIndex++; + } + } + int relationshipIndex = 0; + ap.selectXPath("//common:external-id/common:external-id-relationship"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + extIds.get(relationshipIndex).setRelationShip(vn.toNormalizedString(t)); + relationshipIndex++; + } + } + if (typeIndex == valueIndex) { + return extIds; + } + return new ArrayList(); + } + + private static List getContributors(VTDGen vg, VTDNav vn, AutoPilot ap) + throws XPathParseException, NavException, XPathEvalException { + List contributors = new ArrayList(); + int nameIndex = 0; + ap.selectXPath("//work:contributor/work:credit-name"); + while (ap.evalXPath() != -1) { + Contributor contributor = new Contributor(); + int t = vn.getText(); + if (t >= 0) { + contributor.setCreditName(vn.toNormalizedString(t)); + contributors.add(nameIndex, contributor); + nameIndex++; + } + } + + int sequenceIndex = 0; + ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t)); + sequenceIndex++; + } + } + + int roleIndex = 0; + ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + contributors.get(roleIndex).setRole(vn.toNormalizedString(t)); + roleIndex++; + } + } + return contributors; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java index d5da4eec0..4d8237f77 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -44,7 +44,7 @@ public class XMLRecordParserTest { String xml = IOUtils .toString( - this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml")); + this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml")); XMLRecordParser p = new XMLRecordParser(); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java new file mode 100644 index 000000000..31f8432ac --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -0,0 +1,326 @@ + +package eu.dnetlib.doiboost.orcidnodoi.xml; + +import com.ximpleware.NavException; +import com.ximpleware.ParseException; +import com.ximpleware.XPathEvalException; +import com.ximpleware.XPathParseException; +import eu.dnetlib.dhp.parser.utility.VtdException; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import jdk.nashorn.internal.ir.annotations.Ignore; +import org.apache.commons.io.IOUtils; +import org.apache.commons.text.similarity.JaccardSimilarity; +import 
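Note: XMLRecordParserNoDoi.VTDParseWorkData parses one activity work record with VTD-XML. It returns null when no work:work element is found, sets only errorCode when ORCID returned an error response, and otherwise fills titles, source name, type, urls, publication dates, external ids and contributors; dates and external ids are rebuilt by aligning the separately selected XPath result lists by index. A minimal usage sketch against the test resource added by this patch:

import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;

import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;

public class ParseSingleWorkExample {

    public static void main(String[] args) throws Exception {
        // one ORCID activity (work) record, taken from the test resources added by this patch
        String xml = IOUtils
            .toString(
                ParseSingleWorkExample.class
                    .getResourceAsStream(
                        "/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml"),
                StandardCharsets.UTF_8);

        WorkDataNoDoi work = XMLRecordParserNoDoi.VTDParseWorkData(xml.getBytes(StandardCharsets.UTF_8));
        if (work == null) {
            System.out.println("no work:work element found");
        } else if (work.getErrorCode() != null) {
            System.out.println("ORCID error response: " + work.getErrorCode());
        } else {
            System.out.println(work.getOid() + " - " + work.getTitles());
        }
    }
}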
org.apache.commons.text.similarity.JaroWinklerSimilarity; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.text.Normalizer; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class OrcidNoDoiTest { + + private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class); + + String nameA = "Khairy"; + String surnameA = "Abdel Dayem"; + String otherNameA = "Dayem MKA"; + String nameB = "K"; + String surnameB = "Abdel-Dayem"; + String orcidIdA = "0000-0003-2760-1191"; + Double threshold = 0.8; + + @Test + @Ignore + private void similarityTest() throws Exception { + logger.info("running testSimilarity ...."); + logger + .info( + "JaroWinklerSimilarity: " + + Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB))); + logger + .info( + "JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB))); + } + + @Test + @Ignore + private void bestMatchTest() throws Exception { + logger.info("running bestMatchTest ...."); + String contributor = surnameB + ", " + nameB; + logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor))); + } + + private static Double bestMatch(String authorSurname, String authorName, String contributor) { + logger.debug(authorSurname + " " + authorName + " vs " + contributor); + String[] contributorSplitted = contributor.split(" "); + if (contributorSplitted.length == 0) { + return 0.0; + } + final String contributorName = contributorSplitted[contributorSplitted.length - 1]; + String contributorSurname = ""; + if (contributorSplitted.length > 1) { + StringJoiner joiner = new StringJoiner(" "); + for (int i = 0; i < contributorSplitted.length - 1; i++) { + joiner.add(contributorSplitted[i]); + } + contributorSurname = joiner.toString(); + } + logger + .debug( + "contributorName: " + contributorName + + " contributorSurname: " + contributorSurname); + String authorNameNrm = normalize(authorName); + String authorSurnameNrm = normalize(authorSurname); + String contributorNameNrm = normalize(contributorName); + String contributorSurnameNrm = normalize(contributorSurname); + Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm); + Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm); + if (sm1.compareTo(sm2) >= 0) { + return sm1; + } + return sm2; + } + + private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { + Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB); + logger + .debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score)); + return score; + } + + private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) { + return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); + } + + private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) { + return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); + } + + private static String parse(String name, String surname) { + return surname + " " + name; + } + + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // 
in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + + @Test + @Ignore + public void readPublicationFieldsTest() + throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + logger.info("running loadPublicationFieldsTest ...."); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml")); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData); + assertNotNull(workData.getOid()); + logger.info("oid: " + workData.getOid()); + assertNotNull(workData.getTitles()); + logger.info("titles: "); + workData.getTitles().forEach(t -> { + logger.info(t); + }); + logger.info("source: " + workData.getSourceName()); + logger.info("type: " + workData.getType()); + logger.info("urls: "); + workData.getUrls().forEach(u -> { + logger.info(u); + }); + logger.info("publication date: "); + workData.getPublicationDates().forEach(d -> { + logger.info(d.getYear() + " - " + d.getMonth() + " - " + d.getDay()); + }); + logger.info("external id: "); + workData.getExtIds().removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self")); + workData.getExtIds().forEach(e -> { + logger.info(e.getType() + " - " + e.getValue() + " - " + e.getRelationShip()); + }); + logger.info("contributors: "); + workData.getContributors().forEach(c -> { + logger + .info( + c.getName() + " - " + c.getRole() + " - " + c.getSequence()); + }); + + } + + private void updateRanks(List contributors) { + boolean seqFound = false; + if (contributors + .stream() + .filter( + c -> c.getRole() != null && c.getSequence() != null && + c.getRole().equals("author") && (c.getSequence().equals("first") || + c.getSequence().equals("additional"))) + .count() > 0) { + seqFound = true; + logger.info("sequence data found"); + } + if (!seqFound) { + List seqIds = Arrays.asList(0); + contributors.forEach(c -> { + int currentSeq = seqIds.get(0) + 1; + seqIds.set(0, currentSeq); + c.setSequence(Integer.toString(seqIds.get(0))); + }); + } + } + + private void updateAuthorsSimpleMatch(List contributors, AuthorData author) { + contributors.forEach(c -> { + if (c.isSimpleMatch()) { + logger.info("simple match on : " + c.getCreditName()); + c.setName(author.getName()); + c.setSurname(author.getSurname()); + c.setOid(author.getOid()); + } + }); + updateRanks(contributors); + } + + private void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { + logger.info("inside updateAuthorsSimilarityMatch ..."); + contributors.forEach(c -> { + logger + .info( + c.getOid() + " - " + c.getCreditName() + " - " + + c.getName() + " - " + c.getSurname() + " - " + + c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: " + + c.isSimpleMatch()); + }); + + contributors + .stream() + .filter(c -> c.isBestMatch()) + .forEach(c -> { + logger.info("similarity match on : " + c.getCreditName()); + c.setName(author.getName()); + 
c.setSurname(author.getSurname()); + c.setOid(author.getOid()); + }); + updateRanks(contributors); + } + + @Test + @Ignore + public void authorSimilarityMatchTest() throws Exception { + logger.info("running authorSimilarityMatchTest ...."); + authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml"); + } + + @Test + private void authorSimpleMatchTest() throws Exception { + logger.info("running authorSimpleMatchTest ...."); + authorMatchTest("activity_work_0000-0003-2760-1191.xml"); + } + + private void authorMatchTest(String orcidWork) + throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + AuthorData author = new AuthorData(); + author.setName(nameA); + author.setSurname(surnameA); + author.setOid(orcidIdA); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream(orcidWork)); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData); + int matchCounter = 0; + List matchCounters = Arrays.asList(matchCounter); + Contributor contributor = null; + workData.getContributors().forEach(c -> { + if (normalize(c.getCreditName()).contains(normalize(author.getName())) || + normalize(c.getCreditName()).contains(normalize(author.getSurname())) || + ((author.getOtherName() != null) + && normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) { + matchCounters.set(0, matchCounters.get(0) + 1); + c.setSimpleMatch(true); + } + }); + logger.info("match counter: " + Integer.toString(matchCounters.get(0))); + if (matchCounters.get(0) == 1) { + updateAuthorsSimpleMatch(workData.getContributors(), author); + } else if (matchCounters.get(0) > 1) { + Optional optCon = workData + .getContributors() + .stream() + .filter(c -> c.isSimpleMatch()) + .map(c -> { + c.setScore(bestMatch(nameA, surnameA, c.getCreditName())); + logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore()); + return c; + }) + .filter(c -> c.getScore() >= threshold) + .max(Comparator.comparing(c -> c.getScore())); + Contributor bestMatchContributor = null; + if (optCon.isPresent()) { + bestMatchContributor = optCon.get(); + bestMatchContributor.setBestMatch(true); + logger.info("best match: " + bestMatchContributor.getCreditName()); + updateAuthorsSimilarityMatch(workData.getContributors(), author); + } + + } + + logger.info("UPDATED contributors: "); + workData.getContributors().forEach(c -> { + logger + .info( + c.getOid() + " - " + c.getCreditName() + " - " + + c.getName() + " - " + c.getSurname() + " - " + + c.getRole() + " - " + c.getSequence()); + }); + } +} + +// +// orcid_RDD = sc.textFile(ORCID_DUMP_PATH) +// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec") +// \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml new file mode 100644 index 000000000..485f4f8e8 --- /dev/null +++ 
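Note: the matching exercised by the test above works in two steps: a simple substring match of the author's normalized name, surname or other name inside the contributor credit name, and, when several contributors pass that check, a Jaro-Winkler similarity on normalized "surname name" strings with a 0.8 threshold. A small sketch of the scoring step using commons-text (the dependency added to the pom); the names come from the test fixture and the normalization mirrors the test's approach:

import java.text.Normalizer;

import org.apache.commons.text.similarity.JaroWinklerSimilarity;

public class NameScoringExample {

    // same normalization idea as in the test: NFD, lowercase, collapse diacritics and non-word characters
    private static String normalize(String s) {
        return Normalizer
            .normalize(s, Normalizer.Form.NFD)
            .toLowerCase()
            .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
            .replaceAll("(\\W)+", " ")
            .trim();
    }

    public static void main(String[] args) {
        // ORCID author vs. work contributor, "surname name" order as in the test's parse()
        String author = normalize("Abdel Dayem" + " " + "Khairy");
        String contributor = normalize("Abdel-Dayem" + " " + "K");
        Double score = new JaroWinklerSimilarity().apply(author, contributor);
        // the test accepts a candidate when the score is at least 0.8
        System.out.println("Jaro-Winkler score: " + score);
    }
}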
b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml @@ -0,0 +1,106 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Abdel-Dayem K + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + + diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml similarity index 100% rename from dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml rename to dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml new file mode 100644 index 000000000..650d5a4cb --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml @@ -0,0 +1,113 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 
649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Abdel-Dayem K + + first + author + + + + Abdel-Dayem Fake + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + + diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml new file mode 100644 index 000000000..485f4f8e8 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml @@ -0,0 +1,106 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Abdel-Dayem K + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + + From d6498278edc87aeb15ee61b33edf7f280829b56a Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 25 Jun 2020 18:43:29 +0200 Subject: [PATCH 02/34] added workflow to generate seq(orcidId,work) and seq(orcidId,enrichedWork) --- .../orcid/ActivitiesDecompressor.java | 2 +- .../doiboost/orcid/SummariesDecompressor.java | 2 +- .../doiboost/orcid/json/JsonHelper.java | 16 + .../orcidnodoi/ActivitiesDumpReader.java | 149 +++++ .../orcidnodoi/GenOrcidAuthorWork.java | 52 ++ .../SparkGenEnrichedOrcidWorks.java | 119 ++++ .../json/JsonWriter.java | 2 +- .../orcidnodoi/model/Contributor.java | 6 +- .../orcidnodoi/model/WorkDataNoDoi.java | 1 - .../orcidnodoi/similarity/AuthorMatcher.java | 204 +++++++ .../oozie_app/config-default.xml | 22 + .../oozie_app/workflow.xml | 524 ++++++++++++++++++ .../gen_enriched_orcid_works_parameters.json | 7 + .../orcidnodoi/xml/OrcidNoDoiTest.java | 250 +-------- 14 files changed, 1125 insertions(+), 231 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java rename dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/{orcid => orcidnodoi}/json/JsonWriter.java (94%) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java create mode 100644 
dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java index 570fdef17..80ccd71a1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java @@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; -import eu.dnetlib.doiboost.orcid.json.JsonWriter; +import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index f0bbb5c32..603bfedf6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; -import eu.dnetlib.doiboost.orcid.json.JsonWriter; +import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java new file mode 100644 index 000000000..13a3cee8f --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java @@ -0,0 +1,16 @@ + +package eu.dnetlib.doiboost.orcid.json; + +import com.google.gson.Gson; +import com.google.gson.JsonObject; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; + +public class JsonHelper { + + public static String createOidWork(WorkDataNoDoi workData) { + JsonObject oidWork = new JsonObject(); + oidWork.addProperty("oid", workData.getOid()); + oidWork.addProperty("work", new Gson().toJson(workData)); + return oidWork.toString(); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java new file mode 100644 index 000000000..7eb6faf54 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -0,0 +1,149 @@ + +package eu.dnetlib.doiboost.orcidnodoi; + +import eu.dnetlib.doiboost.orcid.json.JsonHelper; +import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; +import 
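Note: JsonHelper.createOidWork stores the work as a JSON string inside the "work" property, so consumers have to parse twice: first the wrapper object, then the embedded work JSON. A round-trip sketch (the put-code value is illustrative):

import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;

public class OidWorkRoundTrip {

    public static void main(String[] args) {
        WorkDataNoDoi work = new WorkDataNoDoi();
        work.setOid("0000-0003-2760-1191");
        work.setId("12345678"); // illustrative put-code

        // serialize: {"oid":"...","work":"{...the work serialized as a JSON string...}"}
        String json = JsonHelper.createOidWork(work);

        // deserialize: read the wrapper first, then parse the embedded work JSON
        JsonObject wrapper = new JsonParser().parse(json).getAsJsonObject();
        WorkDataNoDoi parsed = new Gson().fromJson(wrapper.get("work").getAsString(), WorkDataNoDoi.class);
        System.out.println(parsed.getOid());
    }
}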
org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.mortbay.log.Log; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URI; + +public class ActivitiesDumpReader { + + private static final int MAX_XML_WORKS_PARSED = -1; + private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000; + + public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + parseTarActivities(fs, conf, gzipInputStream, outputPath); + + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } + + private static void parseTarActivities( + FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { + int counter = 0; + int noDoiFound = 0; + int errorFromOrcidFound = 0; + int xmlParserErrorFound = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; + + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class))) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + + try { + if (entry.isDirectory() || !filename.contains("works")) { + + } else { + Log.debug("XML work entry name: " + entry.getName()); + counter++; + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from + // tarInput + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes()); + if (workDataNoDoi != null) { + if (workDataNoDoi.getErrorCode() != null) { + errorFromOrcidFound += 1; + Log + .debug( + "error from Orcid with code " + + workDataNoDoi.getErrorCode() + + " for entry " + + entry.getName()); + continue; + } + boolean isDoiFound = workDataNoDoi.getExtIds().stream() + .filter(e -> e.getType()!=null) + .anyMatch(e -> e.getType().equals("doi")); + if (!isDoiFound) { + String jsonData = JsonHelper.createOidWork(workDataNoDoi); + Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData); + + final Text key = new Text(workDataNoDoi.getOid()); + final Text value = new Text(jsonData); + + try { + writer.append(key, value); + } catch (IOException e) { + Log.debug("Writing to sequence file: " + e.getMessage()); + Log.debug(e); + throw new 
RuntimeException(e); + } + noDoiFound += 1; + } + + } else { + Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString()); + xmlParserErrorFound += 1; + } + } + } catch (Exception e) { + Log + .warn( + "Parsing work from tar archive and xml work: " + filename + " " + e.getMessage()); + Log.warn(e); + } + + if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) { + Log.info("Current xml works parsed: " + counter); + } + + if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) { + break; + } + } + } + } catch (IOException e) { + Log.warn("Parsing work from gzip archive: " + e.getMessage()); + Log.warn(e); + throw new RuntimeException(e); + } + Log.info("Activities parse completed"); + Log.info("Total XML works parsed: " + counter); + Log.info("Total no doi work found: " + noDoiFound); + Log.info("Error from Orcid found: " + errorFromOrcidFound); + Log.info("Error parsing xml work found: " + xmlParserErrorFound); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java new file mode 100644 index 000000000..b82f4bc4c --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -0,0 +1,52 @@ + +package eu.dnetlib.doiboost.orcidnodoi; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcid.OrcidDSManager; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.mortbay.log.Log; + +import java.io.IOException; + +public class GenOrcidAuthorWork extends OrcidDSManager { + + private String activitiesFileNameTarGz; + private String outputWorksPath; + private String workingPath; + + public static void main(String[] args) throws IOException, Exception { + GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork(); + genOrcidAuthorWork.loadArgs(args); + genOrcidAuthorWork.generateAuthorsDOIsData(); + } + + public void generateAuthorsDOIsData() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz); + Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath)); + ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath); + } + + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenOrcidAuthorWork.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); + parser.parseArgument(args); + + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + workingPath = parser.get("workingPath"); + Log.info("Working Path: " + workingPath); + activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); + Log.info("Activities File Name: " + activitiesFileNameTarGz); + outputWorksPath = parser.get("outputWorksPath"); + Log.info("Output Author Work Data: " + outputWorksPath); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java new file mode 100644 index 
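Note: ActivitiesDumpReader streams the works out of the tar.gz dump and writes one (ORCID iD, work JSON) pair per DOI-less work into a Hadoop SequenceFile; GenOrcidAuthorWork only wires the HDFS paths and delegates to it. A sketch of reading one of the produced files back; the path is illustrative and mirrors the no_doi_works/works_*.seq layout used by the workflow:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadNoDoiWorksSeq {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // illustrative location of one output of GenOrcidAuthorWork
        Path seq = new Path("hdfs://nameservice1/data/orcid_activities/no_doi_works/works_0.seq");
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seq))) {
            Text key = new Text();   // ORCID iD
            Text value = new Text(); // JSON produced by JsonHelper.createOidWork
            while (reader.next(key, value)) {
                System.out.println(key + " -> " + value.getLength() + " bytes of JSON");
            }
        }
    }
}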
000000000..6bb31bcf6 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -0,0 +1,119 @@ + +package eu.dnetlib.doiboost.orcidnodoi; + +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.io.IOException; +import java.util.Objects; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +public class SparkGenEnrichedOrcidWorks { + + public static void main(String[] args) throws IOException, Exception { + Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class); + logger.info("[ SparkGenerateDoiAuthorList STARTED]"); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkGenEnrichedOrcidWorks.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); + parser.parseArgument(args); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); + final String workingPath = parser.get("workingPath"); + logger.info("workingPath: ", workingPath); + final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath"); + logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath); + final String outputWorksPath = parser.get("outputWorksPath"); + logger.info("outputWorksPath: ", outputWorksPath); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaPairRDD summariesRDD = sc + .sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class); + Dataset summariesDataset = spark + .createDataset( + summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), + Encoders.bean(AuthorData.class)); + + JavaPairRDD activitiesRDD = sc + .sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class); + Dataset activitiesDataset = spark + .createDataset( + activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(), + Encoders.bean(WorkDataNoDoi.class)); + + activitiesDataset + .joinWith( + summariesDataset, + activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner") + .map( + (MapFunction, Tuple2>) value -> { + WorkDataNoDoi w = value._1; + AuthorData a = value._2; + AuthorMatcher.match(a, w.getContributors()); + return new Tuple2<>(a.getOid(), w); + }, + Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class))) + .filter(Objects::nonNull) + .toJavaRDD() + .saveAsTextFile(workingPath + outputEnrichedWorksPath);; + }); + } + 
+ private static AuthorData loadAuthorFromJson(Text orcidId, Text json) { + AuthorData authorData = new AuthorData(); + authorData.setOid(orcidId.toString()); + JsonElement jElement = new JsonParser().parse(json.toString()); + authorData.setName(getJsonValue(jElement, "name")); + authorData.setSurname(getJsonValue(jElement, "surname")); + authorData.setCreditName(getJsonValue(jElement, "creditname")); + return authorData; + } + + private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) { + WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class); + return workData; + } + + private static String getJsonValue(JsonElement jElement, String property) { + if (jElement.getAsJsonObject().has(property)) { + JsonElement name = null; + name = jElement.getAsJsonObject().get(property); + if (name != null && !name.isJsonNull()) { + return name.getAsString(); + } + } + return null; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java similarity index 94% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java index 35676d5ba..7f7e3a10a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java @@ -1,5 +1,5 @@ -package eu.dnetlib.doiboost.orcid.json; +package eu.dnetlib.doiboost.orcidnodoi.json; import com.google.gson.JsonObject; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java index 42076de5d..8a170de09 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java @@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData; public class Contributor extends AuthorData implements Serializable { private String sequence; private String role; - private boolean simpleMatch = false; - private Double score = 0.0; - private boolean bestMatch = false; + private transient boolean simpleMatch = false; + private transient Double score = 0.0; + private transient boolean bestMatch = false; public String getSequence() { return sequence; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java index ee13454e1..5756521e7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java @@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable { public void setContributors(List contributors) { this.contributors = contributors; } - } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java new file mode 100644 index 000000000..09fd8b36b --- /dev/null +++ 
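Note: SparkGenEnrichedOrcidWorks joins the author summaries with the DOI-less works on the ORCID iD using typed Datasets and bean encoders, then lets AuthorMatcher enrich the contributors of each joined work. A local-mode sketch of the same joinWith pattern with in-memory data; master, app name and values are illustrative:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;

import scala.Tuple2;

public class JoinByOidSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("join-by-oid").getOrCreate();

        AuthorData author = new AuthorData();
        author.setOid("0000-0003-2760-1191");
        author.setName("Khairy");
        author.setSurname("Abdel Dayem");

        WorkDataNoDoi work = new WorkDataNoDoi();
        work.setOid("0000-0003-2760-1191");
        work.setId("12345678"); // illustrative put-code

        Dataset<AuthorData> authors = spark.createDataset(Arrays.asList(author), Encoders.bean(AuthorData.class));
        Dataset<WorkDataNoDoi> works = spark.createDataset(Arrays.asList(work), Encoders.bean(WorkDataNoDoi.class));

        // inner join on the ORCID iD, the same condition used by SparkGenEnrichedOrcidWorks
        Dataset<Tuple2<WorkDataNoDoi, AuthorData>> joined = works
            .joinWith(authors, works.col("oid").equalTo(authors.col("oid")), "inner");
        System.out.println("joined rows: " + joined.count());
        spark.stop();
    }
}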
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -0,0 +1,204 @@ + +package eu.dnetlib.doiboost.orcidnodoi.similarity; + +import java.io.IOException; +import java.text.Normalizer; +import java.util.*; + +import org.apache.commons.text.similarity.JaroWinklerSimilarity; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.ximpleware.NavException; +import com.ximpleware.ParseException; +import com.ximpleware.XPathEvalException; +import com.ximpleware.XPathParseException; + +import eu.dnetlib.dhp.parser.utility.VtdException; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; + +public class AuthorMatcher { + + private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class); + private static final Double threshold = 0.8; + + public static void match(AuthorData author, List contributors) + throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + + int matchCounter = 0; + List matchCounters = Arrays.asList(matchCounter); + Contributor contributor = null; + contributors.forEach(c -> { + if (normalize(c.getCreditName()).contains(normalize(author.getName())) || + normalize(c.getCreditName()).contains(normalize(author.getSurname())) || + ((author.getOtherName() != null) + && normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) { + matchCounters.set(0, matchCounters.get(0) + 1); + c.setSimpleMatch(true); + } + }); + logger.info("match counter: " + Integer.toString(matchCounters.get(0))); + if (matchCounters.get(0) == 1) { + updateAuthorsSimpleMatch(contributors, author); + } else if (matchCounters.get(0) > 1) { + Optional optCon = contributors + .stream() + .filter(c -> c.isSimpleMatch()) + .map(c -> { + c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); + logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore()); + return c; + }) + .filter(c -> c.getScore() >= threshold) + .max(Comparator.comparing(c -> c.getScore())); + Contributor bestMatchContributor = null; + if (optCon.isPresent()) { + bestMatchContributor = optCon.get(); + bestMatchContributor.setBestMatch(true); + logger.info("best match: " + bestMatchContributor.getCreditName()); + updateAuthorsSimilarityMatch(contributors, author); + } + + } + + logger.info("UPDATED contributors: "); + contributors.forEach(c -> { + logger + .info( + c.getOid() + " - " + c.getCreditName() + " - " + + c.getName() + " - " + c.getSurname() + " - " + + c.getRole() + " - " + c.getSequence()); + }); + } + + private static Double bestMatch(String authorSurname, String authorName, String contributor) { + logger.debug(authorSurname + " " + authorName + " vs " + contributor); + String[] contributorSplitted = contributor.split(" "); + if (contributorSplitted.length == 0) { + return 0.0; + } + final String contributorName = contributorSplitted[contributorSplitted.length - 1]; + String contributorSurname = ""; + if (contributorSplitted.length > 1) { + StringJoiner joiner = new StringJoiner(" "); + for (int i = 0; i < contributorSplitted.length - 1; i++) { + joiner.add(contributorSplitted[i]); + } + contributorSurname = joiner.toString(); + } + logger + .debug( + "contributorName: " + contributorName + + " contributorSurname: " + contributorSurname); + String authorNameNrm = 
normalize(authorName); + String authorSurnameNrm = normalize(authorSurname); + String contributorNameNrm = normalize(contributorName); + String contributorSurnameNrm = normalize(contributorSurname); + Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm); + Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm); + if (sm1.compareTo(sm2) >= 0) { + return sm1; + } + return sm2; + } + + private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { + Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB); + logger + .debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score)); + return score; + } + + private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) { + return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); + } + + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + + private static String parse(String name, String surname) { + return surname + " " + name; + } + + private static void updateAuthorsSimpleMatch(List contributors, AuthorData author) { + contributors.forEach(c -> { + if (c.isSimpleMatch()) { + logger.info("simple match on : " + c.getCreditName()); + c.setName(author.getName()); + c.setSurname(author.getSurname()); + c.setOid(author.getOid()); + } + }); + updateRanks(contributors); + } + + private static void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { + logger.info("inside updateAuthorsSimilarityMatch ..."); + contributors.forEach(c -> { + logger + .info( + c.getOid() + " - " + c.getCreditName() + " - " + + c.getName() + " - " + c.getSurname() + " - " + + c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: " + + c.isSimpleMatch()); + }); + + contributors + .stream() + .filter(c -> c.isBestMatch()) + .forEach(c -> { + logger.info("similarity match on : " + c.getCreditName()); + c.setName(author.getName()); + c.setSurname(author.getSurname()); + c.setOid(author.getOid()); + }); + updateRanks(contributors); + } + + private static void updateRanks(List contributors) { + boolean seqFound = false; + if (contributors + .stream() + .filter( + c -> c.getRole() != null && c.getSequence() != null && + c.getRole().equals("author") && (c.getSequence().equals("first") || + c.getSequence().equals("additional"))) + .count() > 0) { + seqFound = true; + logger.info("sequence data found"); + } + if (!seqFound) { + List seqIds = Arrays.asList(0); + contributors.forEach(c -> { + int currentSeq = seqIds.get(0) + 1; + seqIds.set(0, currentSeq); + c.setSequence(Integer.toString(seqIds.get(0))); + }); + } + } + + private static String toJson(WorkDataNoDoi work) { + GsonBuilder builder = new GsonBuilder(); + Gson gson = builder.create(); + return gson.toJson(work); + } +} diff --git 
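Note: AuthorMatcher.match mutates the contributor list in place: when exactly one contributor passes the simple substring match it receives the author's name, surname and ORCID iD directly; when several match, only the best Jaro-Winkler score at or above 0.8 is enriched; finally updateRanks assigns numeric sequences when no first/additional sequence data is present. A usage sketch with fixture-like values:

import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;

public class AuthorMatcherSketch {

    public static void main(String[] args) throws Exception {
        AuthorData author = new AuthorData();
        author.setOid("0000-0003-2760-1191");
        author.setName("Khairy");
        author.setSurname("Abdel Dayem");

        Contributor c1 = new Contributor();
        c1.setCreditName("Abdel-Dayem K");
        Contributor c2 = new Contributor();
        c2.setCreditName("Nammas W");

        List<Contributor> contributors = new ArrayList<>();
        contributors.add(c1);
        contributors.add(c2);

        // the single matching contributor is enriched with the author's name, surname and ORCID iD;
        // the others only get a numeric sequence assigned by updateRanks
        AuthorMatcher.match(author, contributors);
        contributors.forEach(c -> System.out.println(c.getCreditName() + " -> " + c.getOid()));
    }
}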
a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml new file mode 100644 index 000000000..f2d51e260 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + oozie.action.sharelib.for.java + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx4g + + + jobTracker + hadoop-rm3.garr-pa1.d4science.org:8032 + + + nameNode + hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml new file mode 100644 index 000000000..2486bdb24 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml @@ -0,0 +1,524 @@ + + + + workingPath_activities + the working dir base path + + + shell_cmd_0 + wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 0 + + + shell_cmd_1 + wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 1 + + + shell_cmd_2 + wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 2 + + + shell_cmd_3 + wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 3 + + + shell_cmd_4 + wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 4 + + + shell_cmd_5 + wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 5 + + + shell_cmd_6 + wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal 
/tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 6 + + + shell_cmd_7 + wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 7 + + + shell_cmd_8 + wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 8 + + + shell_cmd_9 + wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 9 + + + shell_cmd_X + wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file X + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_0.tar.gz + -owno_doi_works/works_0.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_1} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_1.tar.gz + -owno_doi_works/works_1.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_2} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_2.tar.gz + -owno_doi_works/works_2.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_3} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_3.tar.gz + -owno_doi_works/works_3.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_4} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + 
-n${nameNode} + -fORCID_2019_activites_4.tar.gz + -owno_doi_works/works_4.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_5} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_5.tar.gz + -owno_doi_works/works_5.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_6} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_6.tar.gz + -owno_doi_works/works_6.seq + + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_7} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_7.tar.gz + -owno_doi_works/works_7.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_8} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_8.tar.gz + -owno_doi_works/works_8.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_9} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_9.tar.gz + -owno_doi_works/works_9.seq + + + + + + + + + ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_X} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_X.tar.gz + -owno_doi_works/works_X.seq + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn + cluster + Gen_Enriched_Orcid_Works + eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks + dhp-doiboost-1.2.3-SNAPSHOT.jar + --num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + + -w${workingPath}/ + -owno_doi_works/ + -oewno_doi_enriched_works/ + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json new file mode 100644 index 000000000..c3a8f92ec --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json @@ -0,0 +1,7 @@ +[ + {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", 
"paramRequired": true}, + {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, + {"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}, + {"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index 31f8432ac..6a5faddbd 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -1,15 +1,12 @@ package eu.dnetlib.doiboost.orcidnodoi.xml; -import com.ximpleware.NavException; -import com.ximpleware.ParseException; -import com.ximpleware.XPathEvalException; -import com.ximpleware.XPathParseException; -import eu.dnetlib.dhp.parser.utility.VtdException; -import eu.dnetlib.doiboost.orcid.model.AuthorData; -import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; -import jdk.nashorn.internal.ir.annotations.Ignore; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.IOException; +import java.text.Normalizer; +import java.util.*; + import org.apache.commons.io.IOUtils; import org.apache.commons.text.similarity.JaccardSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity; @@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.text.Normalizer; -import java.util.*; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.ximpleware.NavException; +import com.ximpleware.ParseException; +import com.ximpleware.XPathEvalException; +import com.ximpleware.XPathParseException; -import static org.junit.jupiter.api.Assertions.assertNotNull; +import eu.dnetlib.dhp.parser.utility.VtdException; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; +import jdk.nashorn.internal.ir.annotations.Ignore; public class OrcidNoDoiTest { @@ -33,100 +39,10 @@ public class OrcidNoDoiTest { String nameB = "K"; String surnameB = "Abdel-Dayem"; String orcidIdA = "0000-0003-2760-1191"; - Double threshold = 0.8; @Test @Ignore - private void similarityTest() throws Exception { - logger.info("running testSimilarity ...."); - logger - .info( - "JaroWinklerSimilarity: " - + Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB))); - logger - .info( - "JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB))); - } - - @Test - @Ignore - private void bestMatchTest() throws Exception { - logger.info("running bestMatchTest ...."); - String contributor = surnameB + ", " + nameB; - logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor))); - } - - private static Double bestMatch(String authorSurname, String authorName, String 
contributor) { - logger.debug(authorSurname + " " + authorName + " vs " + contributor); - String[] contributorSplitted = contributor.split(" "); - if (contributorSplitted.length == 0) { - return 0.0; - } - final String contributorName = contributorSplitted[contributorSplitted.length - 1]; - String contributorSurname = ""; - if (contributorSplitted.length > 1) { - StringJoiner joiner = new StringJoiner(" "); - for (int i = 0; i < contributorSplitted.length - 1; i++) { - joiner.add(contributorSplitted[i]); - } - contributorSurname = joiner.toString(); - } - logger - .debug( - "contributorName: " + contributorName + - " contributorSurname: " + contributorSurname); - String authorNameNrm = normalize(authorName); - String authorSurnameNrm = normalize(authorSurname); - String contributorNameNrm = normalize(contributorName); - String contributorSurnameNrm = normalize(contributorSurname); - Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm); - Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm); - if (sm1.compareTo(sm2) >= 0) { - return sm1; - } - return sm2; - } - - private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { - Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB); - logger - .debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score)); - return score; - } - - private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) { - return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); - } - - private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) { - return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); - } - - private static String parse(String name, String surname) { - return surname + " " + name; - } - - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } - - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } - - @Test - @Ignore - public void readPublicationFieldsTest() + private void readPublicationFieldsTest() throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { logger.info("running loadPublicationFieldsTest ...."); String xml = IOUtils @@ -178,78 +94,10 @@ public class OrcidNoDoiTest { } - private void updateRanks(List contributors) { - boolean seqFound = false; - if (contributors - .stream() - .filter( - c -> c.getRole() != null && c.getSequence() != null && - c.getRole().equals("author") && (c.getSequence().equals("first") || - c.getSequence().equals("additional"))) - .count() > 0) { - seqFound = true; - logger.info("sequence data found"); - } - if (!seqFound) { - List seqIds = Arrays.asList(0); - contributors.forEach(c -> { - int currentSeq = seqIds.get(0) + 1; - seqIds.set(0, currentSeq); - c.setSequence(Integer.toString(seqIds.get(0))); - }); - } - } - - private void updateAuthorsSimpleMatch(List contributors, AuthorData author) { - 
contributors.forEach(c -> { - if (c.isSimpleMatch()) { - logger.info("simple match on : " + c.getCreditName()); - c.setName(author.getName()); - c.setSurname(author.getSurname()); - c.setOid(author.getOid()); - } - }); - updateRanks(contributors); - } - - private void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { - logger.info("inside updateAuthorsSimilarityMatch ..."); - contributors.forEach(c -> { - logger - .info( - c.getOid() + " - " + c.getCreditName() + " - " + - c.getName() + " - " + c.getSurname() + " - " + - c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: " - + c.isSimpleMatch()); - }); - - contributors - .stream() - .filter(c -> c.isBestMatch()) - .forEach(c -> { - logger.info("similarity match on : " + c.getCreditName()); - c.setName(author.getName()); - c.setSurname(author.getSurname()); - c.setOid(author.getOid()); - }); - updateRanks(contributors); - } - @Test - @Ignore - public void authorSimilarityMatchTest() throws Exception { - logger.info("running authorSimilarityMatchTest ...."); - authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml"); - } - - @Test - private void authorSimpleMatchTest() throws Exception { + public void authorMatchTest() throws Exception { logger.info("running authorSimpleMatchTest ...."); - authorMatchTest("activity_work_0000-0003-2760-1191.xml"); - } - - private void authorMatchTest(String orcidWork) - throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; AuthorData author = new AuthorData(); author.setName(nameA); author.setSurname(surnameA); @@ -272,55 +120,9 @@ public class OrcidNoDoiTest { logger.error("parsing xml", e); } assertNotNull(workData); - int matchCounter = 0; - List matchCounters = Arrays.asList(matchCounter); - Contributor contributor = null; - workData.getContributors().forEach(c -> { - if (normalize(c.getCreditName()).contains(normalize(author.getName())) || - normalize(c.getCreditName()).contains(normalize(author.getSurname())) || - ((author.getOtherName() != null) - && normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) { - matchCounters.set(0, matchCounters.get(0) + 1); - c.setSimpleMatch(true); - } - }); - logger.info("match counter: " + Integer.toString(matchCounters.get(0))); - if (matchCounters.get(0) == 1) { - updateAuthorsSimpleMatch(workData.getContributors(), author); - } else if (matchCounters.get(0) > 1) { - Optional optCon = workData - .getContributors() - .stream() - .filter(c -> c.isSimpleMatch()) - .map(c -> { - c.setScore(bestMatch(nameA, surnameA, c.getCreditName())); - logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore()); - return c; - }) - .filter(c -> c.getScore() >= threshold) - .max(Comparator.comparing(c -> c.getScore())); - Contributor bestMatchContributor = null; - if (optCon.isPresent()) { - bestMatchContributor = optCon.get(); - bestMatchContributor.setBestMatch(true); - logger.info("best match: " + bestMatchContributor.getCreditName()); - updateAuthorsSimilarityMatch(workData.getContributors(), author); - } - - } - - logger.info("UPDATED contributors: "); - workData.getContributors().forEach(c -> { - logger - .info( - c.getOid() + " - " + c.getCreditName() + " - " + - c.getName() + " - " + c.getSurname() + " - " + - c.getRole() + " - " + c.getSequence()); - }); + AuthorMatcher.match(author, workData.getContributors()); + GsonBuilder builder = new GsonBuilder(); + 
Gson gson = builder.create(); + logger.info(gson.toJson(workData)); } } - -// -// orcid_RDD = sc.textFile(ORCID_DUMP_PATH) -// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec") -// \ No newline at end of file From b2213b6435dd3180adff6a7546e9f03337e8056c Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Fri, 26 Jun 2020 17:27:34 +0200 Subject: [PATCH 03/34] merged with dnet version --- .../orcid/ActivitiesDecompressor.java | 2 +- .../doiboost/orcid/SummariesDecompressor.java | 2 +- .../doiboost/orcid/json/JsonHelper.java | 1 + .../orcidnodoi/ActivitiesDumpReader.java | 30 +- .../orcidnodoi/GenOrcidAuthorWork.java | 7 +- .../SparkGenEnrichedOrcidWorks.java | 59 +-- .../orcidnodoi/proto/ProtoWriter.java | 427 ++++++++++++++++++ .../oozie_app/workflow.xml | 2 +- 8 files changed, 483 insertions(+), 47 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java index 80ccd71a1..02d2b267b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java @@ -19,9 +19,9 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; -import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; +import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; public class ActivitiesDecompressor { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index 603bfedf6..29d72ed0b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -19,9 +19,9 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; -import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; +import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; public class SummariesDecompressor { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java index 13a3cee8f..bfd6f7447 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java @@ -3,6 +3,7 @@ package eu.dnetlib.doiboost.orcid.json; import com.google.gson.Gson; import com.google.gson.JsonObject; + import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; public class JsonHelper { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index 7eb6faf54..506641b81 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -1,10 +1,12 @@ package eu.dnetlib.doiboost.orcidnodoi; -import eu.dnetlib.doiboost.orcid.json.JsonHelper; -import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; -import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URI; + import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.hadoop.conf.Configuration; @@ -17,11 +19,10 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URI; +import eu.dnetlib.doiboost.orcid.json.JsonHelper; +import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; public class ActivitiesDumpReader { @@ -82,7 +83,8 @@ public class ActivitiesDumpReader { while ((line = br.readLine()) != null) { buffer.append(line); } - WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes()); + WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi + .VTDParseWorkData(buffer.toString().getBytes()); if (workDataNoDoi != null) { if (workDataNoDoi.getErrorCode() != null) { errorFromOrcidFound += 1; @@ -94,9 +96,11 @@ public class ActivitiesDumpReader { + entry.getName()); continue; } - boolean isDoiFound = workDataNoDoi.getExtIds().stream() - .filter(e -> e.getType()!=null) - .anyMatch(e -> e.getType().equals("doi")); + boolean isDoiFound = workDataNoDoi + .getExtIds() + .stream() + .filter(e -> e.getType() != null) + .anyMatch(e -> e.getType().equals("doi")); if (!isDoiFound) { String jsonData = JsonHelper.createOidWork(workDataNoDoi); Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index b82f4bc4c..bbaa5acca 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -1,15 +1,16 @@ package eu.dnetlib.doiboost.orcidnodoi; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.doiboost.orcid.OrcidDSManager; +import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.mortbay.log.Log; -import java.io.IOException; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcid.OrcidDSManager; public class GenOrcidAuthorWork extends OrcidDSManager { diff --git 
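The DOI screening used in ActivitiesDumpReader above keeps only works that carry no DOI-typed external id; a small stand-alone sketch of that predicate, assuming the ExternalId bean from this patch set, with an invented identifier:

import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;

public class DoiFilterSketch {

    public static void main(String[] args) {
        List<ExternalId> extIds = new ArrayList<>();
        ExternalId pmid = new ExternalId();
        pmid.setType("pmid");
        pmid.setValue("12345678");
        extIds.add(pmid);

        // same check as in ActivitiesDumpReader: null types are skipped,
        // and the work is kept only when no "doi" external id is present
        boolean isDoiFound = extIds
            .stream()
            .filter(e -> e.getType() != null)
            .anyMatch(e -> e.getType().equals("doi"));

        System.out.println("write to no-DOI sequence file: " + !isDoiFound);
    }
}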
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 6bb31bcf6..9d9c5bc4a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -1,13 +1,12 @@ package eu.dnetlib.doiboost.orcidnodoi; -import com.google.gson.Gson; -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.doiboost.orcid.model.AuthorData; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; -import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.IOException; +import java.util.Objects; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -19,14 +18,17 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; -import java.io.IOException; -import java.util.Objects; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - public class SparkGenEnrichedOrcidWorks { public static void main(String[] args) throws IOException, Exception { @@ -67,27 +69,28 @@ public class SparkGenEnrichedOrcidWorks { Encoders.bean(AuthorData.class)); JavaPairRDD activitiesRDD = sc - .sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class); + .sequenceFile(workingPath + outputWorksPath + "works_X.seq", Text.class, Text.class); Dataset activitiesDataset = spark .createDataset( activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(), Encoders.bean(WorkDataNoDoi.class)); activitiesDataset - .joinWith( - summariesDataset, - activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner") - .map( - (MapFunction, Tuple2>) value -> { - WorkDataNoDoi w = value._1; - AuthorData a = value._2; - AuthorMatcher.match(a, w.getContributors()); - return new Tuple2<>(a.getOid(), w); - }, - Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class))) - .filter(Objects::nonNull) - .toJavaRDD() - .saveAsTextFile(workingPath + outputEnrichedWorksPath);; + .joinWith( + summariesDataset, + activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner") + .map( + (MapFunction, Tuple2>) value -> { + WorkDataNoDoi w = value._1; + AuthorData a = value._2; + AuthorMatcher.match(a, w.getContributors()); + return new Tuple2<>(a.getOid(), w); + }, + Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class))) + .filter(Objects::nonNull) + .toJavaRDD() + .saveAsTextFile(workingPath + outputEnrichedWorksPath); + ; }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java 
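The Spark job above enriches each parsed work with its author summary through an inner join on the ORCID iD; a rough in-memory equivalent of that join, assuming the beans from this patch (Spark and serialization details omitted):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;

public class EnrichmentJoinSketch {

    // pairs each work with the author summary sharing its oid and lets
    // AuthorMatcher copy the author data onto the matching contributor
    public static List<WorkDataNoDoi> enrich(List<AuthorData> authors, List<WorkDataNoDoi> works)
        throws Exception {
        Map<String, AuthorData> authorsByOid = new HashMap<>();
        for (AuthorData a : authors) {
            authorsByOid.put(a.getOid(), a);
        }
        List<WorkDataNoDoi> enriched = new ArrayList<>();
        for (WorkDataNoDoi w : works) {
            AuthorData a = authorsByOid.get(w.getOid());
            if (a == null) {
                continue; // "inner" join semantics: drop works without a summary
            }
            AuthorMatcher.match(a, w.getContributors());
            enriched.add(w);
        }
        return enriched;
    }
}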
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java new file mode 100644 index 000000000..01b172359 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java @@ -0,0 +1,427 @@ + +package eu.dnetlib.doiboost.orcidnodoi.proto; + +public class ProtoWriter { + +} +// +//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues; +//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype; +//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier; +//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue; +//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate; +// +//import java.io.IOException; +//import java.io.InputStream; +//import java.util.ArrayList; +//import java.util.HashMap; +//import java.util.List; +//import java.util.Map; +// +//import org.apache.commons.io.IOUtils; +//import org.apache.commons.lang3.StringUtils; +// +//import com.google.gson.Gson; +//import com.google.gson.JsonArray; +//import com.google.gson.JsonElement; +//import com.google.gson.JsonObject; +//import com.googlecode.protobuf.format.JsonFormat; +// +//import eu.dnetlib.actionmanager.actions.ActionFactory; +//import eu.dnetlib.actionmanager.actions.AtomicAction; +//import eu.dnetlib.actionmanager.common.Agent; +//import eu.dnetlib.data.mapreduce.hbase.Reporter; +//import eu.dnetlib.data.mapreduce.util.StreamUtils; +//import eu.dnetlib.data.proto.FieldTypeProtos; +//import eu.dnetlib.data.proto.FieldTypeProtos.Author; +//import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo; +//import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue; +//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; +//import eu.dnetlib.data.proto.FieldTypeProtos.StringField; +//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; +//import eu.dnetlib.data.proto.KindProtos; +//import eu.dnetlib.data.proto.OafProtos; +//import eu.dnetlib.data.proto.ResultProtos; +//import eu.dnetlib.data.proto.TypeProtos; +//import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions; +//import eu.dnetlib.miscutils.collections.Pair; +//import eu.dnetlib.miscutils.datetime.DateUtils; +//import eu.dnetlib.pace.model.Person; +// +//public class ProtoWriter { +// +// public static final String ORCID = "ORCID"; +// public final static String orcidPREFIX = "orcid_______"; +// public static final String OPENAIRE_PREFIX = "openaire____"; +// public static final String SEPARATOR = "::"; +// +// private static Map> datasources = new HashMap>() { +// +// { +// put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); +// +// } +// }; +// +// // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname +// private static Map> externalIds = new HashMap>() { +// +// { +// put("ark".toLowerCase(), new Pair<>("ark", "ark")); +// put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); +// put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); +// put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); +// put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); +// put("urn".toLowerCase(), new Pair<>("urn", "urn")); +// } +// }; +// +// static Map> typologiesMapping; +// +// static { +// try { +// final InputStream is = 
OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json"); +// final String tt = IOUtils.toString(is); +// typologiesMapping = new Gson().fromJson(tt, Map.class); +// } catch (final IOException e) { +// e.printStackTrace(); +// } +// } +// +// public static final String PID_TYPES = "dnet:pid_types"; +// +// public static List generatePublicationActionsFromDump(final JsonObject rootElement, +// final ActionFactory factory, +// final String setName, +// final Agent agent, +// final Reporter context) { +// +// if (!isValid(rootElement, context)) { return null; } +// +// // Create OAF proto +// +// final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder(); +// +// oaf.setDataInfo( +// DataInfo.newBuilder() +// .setDeletedbyinference(false) +// .setInferred(false) +// .setTrust("0.9") +// .setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions")) +// .build()); +// +// // Adding kind +// oaf.setKind(KindProtos.Kind.entity); +// +// oaf.setLastupdatetimestamp(DateUtils.now()); +// +// // creating result proto +// final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result); +// +// entity.setDateofcollection("2018-10-22"); +// entity.setDateoftransformation(DateUtils.now_ISO8601()); +// +// // Adding external ids +// StreamUtils.toStream(externalIds.keySet().iterator()) +// .forEach(jsonExtId -> { +// final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); +// final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); +// final String extId = getStringValue(rootElement, jsonExtId); +// if (StringUtils.isNotBlank(extId)) { +// entity.addPid(StructuredProperty.newBuilder() +// .setValue(extId) +// .setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types") +// .setSchemename("dnet:pid_types").build()) +// .build()); +// } +// }); +// +// // Create result field +// final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder(); +// +// // Create metadata proto +// final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder(); +// +// // Adding source +// final String source = getStringValue(rootElement, "source"); +// if (StringUtils.isNotBlank(source)) { +// metadata.addSource(StringField.newBuilder().setValue(source).build()); +// } +// +// // Adding title +// final String title = createRepeatedField(rootElement, "titles"); +// if (StringUtils.isBlank(title)) { +// context.incrementCounter("filtered", "title_not_found", 1); +// return null; +// } +// metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder() +// .setValue(title) +// .setQualifier(getQualifier("main title", "dnet:dataCite_title")) +// .build()); +// +// // Adding identifier +// final String id = getStringValue(rootElement, "id"); +// String sourceId = null; +// if (id != null) { +// entity.addOriginalId(id); +// sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id)); +// } else { +// sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title)); +// } +// entity.setId(sourceId); +// +// // Adding relevant date +// settingRelevantDate(rootElement, metadata, "publication_date", "issued", true); +// +// // Adding collectedfrom +// final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder() +// .setValue(ORCID) +// .setKey("10|" + 
OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a") +// .build(); +// entity.addCollectedfrom(collectedFrom); +// +// // Adding type +// final String type = getStringValue(rootElement, "type"); +// String cobjValue = ""; +// if (StringUtils.isNotBlank(type)) { +// +// metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder() +// .setClassid(type) +// .setClassname(type) +// .setSchemeid("dnet:dataCite_resource") +// .setSchemename("dnet:dataCite_resource") +// .build()); +// +// final String typeValue = typologiesMapping.get(type).get("value"); +// cobjValue = typologiesMapping.get(type).get("cobj"); +// final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder(); +// +// // Adding hostedby +// instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder() +// .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c") +// .setValue("Unknown Repository") +// .build()); +// +// // Adding url +// final String url = createRepeatedField(rootElement, "urls"); +// if (StringUtils.isNotBlank(url)) { +// instance.addUrl(url); +// } +// +// final String pubDate = getPublicationDate(rootElement, "publication_date"); +// if (StringUtils.isNotBlank(pubDate)) { +// instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build()); +// } +// +// instance.setCollectedfrom(collectedFrom); +// +// // Adding accessright +// instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder() +// .setClassid("UNKNOWN") +// .setClassname("UNKNOWN") +// .setSchemeid("dnet:access_modes") +// .setSchemename("dnet:access_modes") +// .build()); +// +// // Adding type +// instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder() +// .setClassid(cobjValue) +// .setClassname(typeValue) +// .setSchemeid("dnet:publication_resource") +// .setSchemename("dnet:publication_resource") +// .build()); +// +// result.addInstance(instance); +// } else { +// context.incrementCounter("filtered", "type_not_found", 1); +// return null; +// } +// +// // Adding authors +// final List authors = createAuthors(rootElement); +// if (authors != null && authors.size() > 0) { +// metadata.addAllAuthor(authors); +// } else { +// context.incrementCounter("filtered", "author_not_found", 1); +// return null; +// } +// +// metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies")); +// result.setMetadata(metadata.build()); +// entity.setResult(result.build()); +// oaf.setEntity(entity.build()); +// +// final List actionList = new ArrayList<>(); +// +// actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray())); +// +//// System.out.println(JsonFormat.printToString(oaf.build())); +// return actionList; +// +// } +// +// public static List createAuthors(final JsonObject root) { +// +// final String authorsJSONFieldName = "authors"; +// +// if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) { +// +// final List authors = new ArrayList<>(); +// final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName); +// int firstCounter = 0; +// int defaultCounter = 0; +// int rank = 1; +// int currentRank = 0; +// +// for (final JsonElement item : jsonAuthors) { +// final JsonObject author = item.getAsJsonObject(); +// final Author.Builder result = Author.newBuilder(); +// if (item.isJsonObject()) { +// final String surname = getStringValue(author, "surname"); +// final String name = getStringValue(author, 
"name"); +// final String oid = getStringValue(author, "oid"); +// final String seq = getStringValue(author, "seq"); +// if (StringUtils.isNotBlank(seq)) { +// if (seq.equals("first")) { +// firstCounter += 1; +// rank = firstCounter; +// +// } else if (seq.equals("additional")) { +// rank = currentRank + 1; +// } else { +// defaultCounter += 1; +// rank = defaultCounter; +// } +// } +// +// if (StringUtils.isNotBlank(oid)) { +// result.addPid(KeyValue.newBuilder() +// .setValue(oid) +// .setKey("ORCID") +// .build()); +// result.setFullname(name + " " + surname); +// if (StringUtils.isNotBlank(name)) { +// result.setName(name); +// } +// if (StringUtils.isNotBlank(surname)) { +// result.setSurname(surname); +// } +// } else { +// String fullname = ""; +// if (StringUtils.isNotBlank(name)) { +// fullname = name; +// } else { +// if (StringUtils.isNotBlank(surname)) { +// fullname = surname; +// } +// } +// Person p = new Person(fullname, false); +// if (p.isAccurate()) { +// result.setName(p.getNormalisedFirstName()); +// result.setSurname(p.getNormalisedSurname()); +// result.setFullname(p.getNormalisedFullname()); +// } +// else { +// result.setFullname(fullname); +// } +// } +// } +// result.setRank(rank); +// authors.add(result.build()); +// currentRank = rank; +// } +// return authors; +// +// } +// return null; +// } +// +// private static String createRepeatedField(final JsonObject rootElement, final String fieldName) { +// String field = ""; +// if (!rootElement.has(fieldName)) { return null; } +// if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; } +// if (rootElement.get(fieldName).isJsonArray()) { +// if (!isValidJsonArray(rootElement, fieldName)) { return null; } +// final StringBuilder ttl = new StringBuilder(); +// getArrayValues(rootElement, fieldName).forEach(ttl::append); +// field = ttl.toString(); +// } else { +// field = getStringValue(rootElement, fieldName); +// } +// +// if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') { +// field = field.substring(1, field.length() - 1); +// } +// return field; +// } +// +// private static void settingRelevantDate(final JsonObject rootElement, +// final ResultProtos.Result.Metadata.Builder metadata, +// final String jsonKey, +// final String dictionaryKey, +// final boolean addToDateOfAcceptance) { +// +// final String pubDate = getPublicationDate(rootElement, "publication_date"); +// if (StringUtils.isNotBlank(pubDate)) { +// if (addToDateOfAcceptance) { +// metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build()); +// } +// metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder() +// .setValue(pubDate) +// .setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date")) +// .build()); +// } +// } +// +// private static String getPublicationDate(final JsonObject rootElement, +// final String jsonKey) { +// +// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); +// if (pubDateJson == null) { return null; } +// final String year = getStringValue(pubDateJson, "year"); +// final String month = getStringValue(pubDateJson, "month"); +// final String day = getStringValue(pubDateJson, "day"); +// +// if (StringUtils.isBlank(year)) { return null; } +// String pubDate = "".concat(year); +// if (StringUtils.isNotBlank(month)) { +// pubDate = pubDate.concat("-" + month); +// if (StringUtils.isNotBlank(day)) { +// pubDate = pubDate.concat("-" + day); +// } else { +// pubDate 
+= "-01"; +// } +// } else { +// pubDate += "-01-01"; +// } +// if (isValidDate(pubDate)) { return pubDate; } +// return null; +// } +// +// protected static boolean isValid(final JsonObject rootElement, final Reporter context) { +// +// final String type = getStringValue(rootElement, "type"); +// if (!typologiesMapping.containsKey(type)) { +// context.incrementCounter("filtered", "unknowntype_" + type, 1); +// return false; +// } +// +// if (!isValidJsonArray(rootElement, "titles")) { +// context.incrementCounter("filtered", "invalid_title", 1); +// return false; +// } +// return true; +// } +// +// private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { +// if (!rootElement.has(fieldName)) { return false; } +// final JsonElement jsonElement = rootElement.get(fieldName); +// if (jsonElement.isJsonNull()) { return false; } +// if (jsonElement.isJsonArray()) { +// final JsonArray jsonArray = jsonElement.getAsJsonArray(); +// if (jsonArray.isJsonNull()) { return false; } +// if (jsonArray.get(0).isJsonNull()) { return false; } +// } +// return true; +// } +//} diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml index 2486bdb24..33fbdf875 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml @@ -509,7 +509,7 @@ cluster Gen_Enriched_Orcid_Works eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks - dhp-doiboost-1.2.3-SNAPSHOT.jar + dhp-doiboost-1.2.2-SNAPSHOT.jar --num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} -w${workingPath}/ From b7b6be12a51c81b2b7469684cf18bc8a3014aec4 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 29 Jun 2020 18:03:16 +0200 Subject: [PATCH 04/34] fixed enriched works generation --- .../doiboost/orcid/json/JsonHelper.java | 6 +-- .../orcidnodoi/ActivitiesDumpReader.java | 4 +- .../orcidnodoi/GenOrcidAuthorWork.java | 1 + .../SparkGenEnrichedOrcidWorks.java | 29 +++++------ .../orcidnodoi/similarity/AuthorMatcher.java | 48 +++++-------------- .../orcidnodoi/xml/XMLRecordParserNoDoi.java | 4 +- .../oozie_app/config-default.xml | 17 +++++-- .../oozie_app/workflow.xml | 24 +++++++--- 8 files changed, 66 insertions(+), 67 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java index bfd6f7447..94f7d8c91 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java @@ -2,16 +2,12 @@ package eu.dnetlib.doiboost.orcid.json; import com.google.gson.Gson; -import com.google.gson.JsonObject; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; public class JsonHelper { public static String createOidWork(WorkDataNoDoi workData) { - JsonObject oidWork = new JsonObject(); - oidWork.addProperty("oid", workData.getOid()); - oidWork.addProperty("work", new 
Gson().toJson(workData)); - return oidWork.toString(); + return new Gson().toJson(workData); } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index 506641b81..bf63568d8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; public class ActivitiesDumpReader { - private static final int MAX_XML_WORKS_PARSED = -1; - private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000; + private static final int MAX_XML_WORKS_PARSED = 100; + private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10; public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) throws Exception { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index bbaa5acca..8dcee796c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -45,6 +45,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager { Log.info("HDFS URI: " + hdfsServerUri); workingPath = parser.get("workingPath"); Log.info("Working Path: " + workingPath); + hdfsOrcidDefaultPath = workingPath; activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); Log.info("Activities File Name: " + activitiesFileNameTarGz); outputWorksPath = parser.get("outputWorksPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 9d9c5bc4a..ae1e4dae6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -24,6 +24,7 @@ import com.google.gson.JsonElement; import com.google.gson.JsonParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; @@ -31,9 +32,9 @@ import scala.Tuple2; public class SparkGenEnrichedOrcidWorks { + static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class); + public static void main(String[] args) throws IOException, Exception { - Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class); - logger.info("[ SparkGenerateDoiAuthorList STARTED]"); final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -46,13 +47,9 @@ public class SparkGenEnrichedOrcidWorks { .ofNullable(parser.get("isSparkSessionManaged")) .map(Boolean::valueOf) .orElse(Boolean.TRUE); - logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String workingPath = parser.get("workingPath"); - logger.info("workingPath: ", workingPath); final String outputEnrichedWorksPath = 
parser.get("outputEnrichedWorksPath"); - logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath); final String outputWorksPath = parser.get("outputWorksPath"); - logger.info("outputWorksPath: ", outputWorksPath); SparkConf conf = new SparkConf(); runWithSparkSession( @@ -67,30 +64,33 @@ public class SparkGenEnrichedOrcidWorks { .createDataset( summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), Encoders.bean(AuthorData.class)); + logger.info("Authors data loaded: " + summariesDataset.count()); JavaPairRDD activitiesRDD = sc - .sequenceFile(workingPath + outputWorksPath + "works_X.seq", Text.class, Text.class); + .sequenceFile(workingPath + outputWorksPath + "*.seq", Text.class, Text.class); Dataset activitiesDataset = spark .createDataset( activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(), Encoders.bean(WorkDataNoDoi.class)); + logger.info("Works data loaded: " + activitiesDataset.count()); - activitiesDataset + JavaRDD> enrichedWorksRDD = activitiesDataset .joinWith( summariesDataset, activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner") .map( - (MapFunction, Tuple2>) value -> { + (MapFunction, Tuple2>) value -> { WorkDataNoDoi w = value._1; AuthorData a = value._2; AuthorMatcher.match(a, w.getContributors()); - return new Tuple2<>(a.getOid(), w); + return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w)); }, - Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class))) + Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .filter(Objects::nonNull) - .toJavaRDD() - .saveAsTextFile(workingPath + outputEnrichedWorksPath); - ; + .toJavaRDD(); + logger.info("Works enriched data created: " + enrichedWorksRDD.count()); + enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath); + logger.info("Works enriched data saved"); }); } @@ -105,6 +105,7 @@ public class SparkGenEnrichedOrcidWorks { } private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) { + WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class); return workData; } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 09fd8b36b..1e4c38bef 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -33,15 +33,13 @@ public class AuthorMatcher { List matchCounters = Arrays.asList(matchCounter); Contributor contributor = null; contributors.forEach(c -> { - if (normalize(c.getCreditName()).contains(normalize(author.getName())) || - normalize(c.getCreditName()).contains(normalize(author.getSurname())) || - ((author.getOtherName() != null) - && normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) { + if (simpleMatch(c.getCreditName(), author.getName()) || + simpleMatch(c.getCreditName(), author.getSurname()) || + simpleMatch(c.getCreditName(), author.getOtherName())) { matchCounters.set(0, matchCounters.get(0) + 1); c.setSimpleMatch(true); } }); - logger.info("match counter: " + Integer.toString(matchCounters.get(0))); if (matchCounters.get(0) == 1) { updateAuthorsSimpleMatch(contributors, author); } else if (matchCounters.get(0) > 1) { @@ -50,7 +48,6 @@ public class AuthorMatcher { .filter(c -> c.isSimpleMatch()) .map(c -> { 
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); - logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore()); return c; }) .filter(c -> c.getScore() >= threshold) @@ -59,24 +56,21 @@ public class AuthorMatcher { if (optCon.isPresent()) { bestMatchContributor = optCon.get(); bestMatchContributor.setBestMatch(true); - logger.info("best match: " + bestMatchContributor.getCreditName()); updateAuthorsSimilarityMatch(contributors, author); } } - logger.info("UPDATED contributors: "); - contributors.forEach(c -> { - logger - .info( - c.getOid() + " - " + c.getCreditName() + " - " + - c.getName() + " - " + c.getSurname() + " - " + - c.getRole() + " - " + c.getSequence()); - }); + } + + private static boolean simpleMatch(String name, String searchValue) { + if (searchValue == null) { + return false; + } + return normalize(name).contains(normalize(searchValue)); } private static Double bestMatch(String authorSurname, String authorName, String contributor) { - logger.debug(authorSurname + " " + authorName + " vs " + contributor); String[] contributorSplitted = contributor.split(" "); if (contributorSplitted.length == 0) { return 0.0; @@ -90,10 +84,6 @@ public class AuthorMatcher { } contributorSurname = joiner.toString(); } - logger - .debug( - "contributorName: " + contributorName + - " contributorSurname: " + contributorSurname); String authorNameNrm = normalize(authorName); String authorSurnameNrm = normalize(authorSurname); String contributorNameNrm = normalize(contributorName); @@ -108,8 +98,6 @@ public class AuthorMatcher { private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB); - logger - .debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score)); return score; } @@ -118,6 +106,9 @@ public class AuthorMatcher { } private static String normalize(final String s) { + if (s == null) { + return new String(""); + } return nfd(s) .toLowerCase() // do not compact the regexes in a single expression, would cause StackOverflowError @@ -142,7 +133,6 @@ public class AuthorMatcher { private static void updateAuthorsSimpleMatch(List contributors, AuthorData author) { contributors.forEach(c -> { if (c.isSimpleMatch()) { - logger.info("simple match on : " + c.getCreditName()); c.setName(author.getName()); c.setSurname(author.getSurname()); c.setOid(author.getOid()); @@ -152,21 +142,10 @@ public class AuthorMatcher { } private static void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { - logger.info("inside updateAuthorsSimilarityMatch ..."); - contributors.forEach(c -> { - logger - .info( - c.getOid() + " - " + c.getCreditName() + " - " + - c.getName() + " - " + c.getSurname() + " - " + - c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: " - + c.isSimpleMatch()); - }); - contributors .stream() .filter(c -> c.isBestMatch()) .forEach(c -> { - logger.info("similarity match on : " + c.getCreditName()); c.setName(author.getName()); c.setSurname(author.getSurname()); c.setOid(author.getOid()); @@ -184,7 +163,6 @@ public class AuthorMatcher { c.getSequence().equals("additional"))) .count() > 0) { seqFound = true; - logger.info("sequence data found"); } if (!seqFound) { List seqIds = Arrays.asList(0); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java 
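For reference, the simpleMatch/normalize pair introduced just above compares lower-cased strings with punctuation, digits and diacritics stripped and whitespace collapsed; expected behaviour on invented inputs (both helpers are private to AuthorMatcher, so this is a comment-only sketch):

// normalize("Abdel-Dayem, Khairy")                  -> "abdel dayem khairy"
// normalize("Abdel-Dayem")                          -> "abdel dayem"
// normalize(null)                                   -> ""    (guard added in this patch)
// simpleMatch("Abdel-Dayem, Khairy", "Abdel-Dayem") -> true  (normalized containment)
// simpleMatch("Abdel-Dayem, Khairy", null)          -> false (null search values are rejected)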
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java index 6e5771547..ae96a322f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -41,7 +41,6 @@ public class XMLRecordParserNoDoi { public static WorkDataNoDoi VTDParseWorkData(byte[] bytes) throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException, NavException, XPathEvalException { - logger.info("parsing xml ..."); final VTDGen vg = new VTDGen(); vg.setDoc(bytes); vg.parse(true); @@ -191,6 +190,9 @@ public class XMLRecordParserNoDoi { nameIndex++; } } + if (contributors.size() == 0) { + return contributors; + } int sequenceIndex = 0; ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence"); diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml index f2d51e260..3068562d0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml @@ -8,15 +8,24 @@ true - oozie.launcher.mapreduce.map.java.opts - -Xmx4g + oozie.launcher.mapreduce.map.java.opts + -Xmx4g jobTracker - hadoop-rm3.garr-pa1.d4science.org:8032 + yarnRM nameNode - hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 + hdfs://nameservice1 + + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml index 33fbdf875..df5e0e76f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml @@ -71,10 +71,9 @@ the shell command that downloads and puts to hdfs orcid activity file X - + - - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -133,6 +132,7 @@ -n${nameNode} -fORCID_2019_activites_0.tar.gz -owno_doi_works/works_0.seq + -oewno_doi_enriched_works/ @@ -169,6 +169,7 @@ -n${nameNode} -fORCID_2019_activites_1.tar.gz -owno_doi_works/works_1.seq + -oewno_doi_enriched_works/ @@ -205,6 +206,7 @@ -n${nameNode} -fORCID_2019_activites_2.tar.gz -owno_doi_works/works_2.seq + -oewno_doi_enriched_works/ @@ -241,6 +243,7 @@ -n${nameNode} -fORCID_2019_activites_3.tar.gz -owno_doi_works/works_3.seq + -oewno_doi_enriched_works/ @@ -277,6 +280,7 @@ -n${nameNode} -fORCID_2019_activites_4.tar.gz -owno_doi_works/works_4.seq + -oewno_doi_enriched_works/ @@ -313,6 +317,7 @@ -n${nameNode} -fORCID_2019_activites_5.tar.gz -owno_doi_works/works_5.seq + -oewno_doi_enriched_works/ @@ -349,6 +354,7 @@ -n${nameNode} -fORCID_2019_activites_6.tar.gz -owno_doi_works/works_6.seq + -oewno_doi_enriched_works/ @@ -386,6 +392,7 @@ -n${nameNode} -fORCID_2019_activites_7.tar.gz -owno_doi_works/works_7.seq + 
-oewno_doi_enriched_works/ @@ -422,6 +429,7 @@ -n${nameNode} -fORCID_2019_activites_8.tar.gz -owno_doi_works/works_8.seq + -oewno_doi_enriched_works/ @@ -458,6 +466,7 @@ -n${nameNode} -fORCID_2019_activites_9.tar.gz -owno_doi_works/works_9.seq + -oewno_doi_enriched_works/ @@ -494,11 +503,12 @@ -n${nameNode} -fORCID_2019_activites_X.tar.gz -owno_doi_works/works_X.seq + -oewno_doi_enriched_works/ - + @@ -509,12 +519,14 @@ cluster Gen_Enriched_Orcid_Works eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks - dhp-doiboost-1.2.2-SNAPSHOT.jar + dhp-doiboost-1.2.4-SNAPSHOT.jar --num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} -w${workingPath}/ + -n${nameNode} + -f- -owno_doi_works/ - -oewno_doi_enriched_works/ + -oewno_doi_enriched_works/output From 5525f57ec8f9ef07d74ab30c54ab8d39e924d413 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 1 Jul 2020 18:36:14 +0200 Subject: [PATCH 05/34] converter from orcid work json to oaf --- .../orcidnodoi/oaf/OrcidWorkToOAF.java | 420 +++++++++++++++++ .../orcidnodoi/proto/ProtoWriter.java | 427 ------------------ .../orcidnodoi/util/DumpToActionsUtility.java | 107 +++++ .../doiboost/orcidnodoi/util/Pair.java | 30 ++ .../orcidnodoi/mappings/typologies.json | 41 ++ 5 files changed, 598 insertions(+), 427 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java new file mode 100644 index 000000000..673abb407 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java @@ -0,0 +1,420 @@ + +package eu.dnetlib.doiboost.orcidnodoi.oaf; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import eu.dnetlib.dhp.common.PacePerson; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks; +import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; +import eu.dnetlib.doiboost.orcidnodoi.util.Pair; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.*; +import java.util.stream.Collectors; + +import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*; + +public class OrcidWorkToOAF { + + static Logger logger = LoggerFactory.getLogger(OrcidWorkToOAF.class); + + public static final String ORCID = "ORCID"; + public final static String orcidPREFIX = "orcid_______"; + public static final String OPENAIRE_PREFIX = "openaire____"; + public static final String SEPARATOR = "::"; + + 
private static Map> datasources = new HashMap>() { + + { + put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); + + } + }; + + // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname + private static Map> externalIds = new HashMap>() { + + { + put("ark".toLowerCase(), new Pair<>("ark", "ark")); + put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); + put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); + put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); + put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); + put("urn".toLowerCase(), new Pair<>("urn", "urn")); + } + }; + + static Map> typologiesMapping; + + static { + try { + final String tt = IOUtils.toString(OrcidWorkToOAF.class.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json")); + typologiesMapping = new Gson().fromJson(tt, Map.class); + } catch (final Exception e) { + logger.error("loading typologies", e); + } + } + + public static final String PID_TYPES = "dnet:pid_types"; + + public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement, final String setName) { + + if (!isValid(rootElement/*, context*/)) { return null; } + + Publication publication = new Publication(); + + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(false); + dataInfo.setInferred(false); + dataInfo.setTrust("0.9"); + dataInfo.setProvenanceaction( + mapQualifier( + "sysimport:actionset:orcidworks-no-doi", + "sysimport:actionset:orcidworks-no-doi", + "dnet:provenanceActions", + "dnet:provenanceActions")); + publication.setDataInfo(dataInfo); + + publication.setLastupdatetimestamp(new Date().getTime()); + + publication.setDateofcollection("2019-10-22"); + publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601()); + + // Adding external ids + externalIds.keySet().stream() + .forEach(jsonExtId -> { + final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); + final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); + final String extId = getStringValue(rootElement, jsonExtId); + if (StringUtils.isNotBlank(extId)) { + publication.getExternalReference().add( + convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types")); + } + }); + + // Adding source +// final String source = getStringValue(rootElement, "source"); +// if (StringUtils.isNotBlank(source)) { +// metadata.addSource(StringField.newBuilder().setValue(source).build()); +// } + + // Adding titles + final List titles = createRepeatedField(rootElement, "titles"); + if (titles==null || titles.isEmpty()) { +// context.incrementCounter("filtered", "title_not_found", 1); + return null; + } + Qualifier q = mapQualifier("main title","main title","dnet:dataCite_title","dnet:dataCite_title"); + publication.setTitle( + titles + .stream() + .map(t -> { + return mapStructuredProperty(t, q, null); + }) + .collect(Collectors.toList())); + // Adding identifier + final String id = getStringValue(rootElement, "id"); + String sourceId = null; + if (id != null) { + publication.setOriginalId(Arrays.asList(id)); + sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase())); + } else { + String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(",")); + sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase())); + } + publication.setId(sourceId); + + // Adding relevant date + 
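A note on the identifier built just above: it follows the pattern "50|" (result entity) plus the underscore-padded ORCID datasource prefix, the "::" separator and an MD5 of the lower-cased original id (or of the concatenated titles when the id is missing). A tiny sketch of that composition, assuming DHPUtils.md5 is a plain hex-encoded MD5 digest (an assumption made here for illustration):

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class OafIdSketch {

	static String md5(String s) throws Exception {
		MessageDigest md = MessageDigest.getInstance("MD5");
		byte[] digest = md.digest(s.getBytes(StandardCharsets.UTF_8));
		StringBuilder sb = new StringBuilder();
		for (byte b : digest) {
			sb.append(String.format("%02x", b & 0xff));
		}
		return sb.toString();
	}

	public static void main(String[] args) throws Exception {
		String orcidPrefix = "orcid_______"; // 12 characters, right-padded with underscores
		String originalId = "sample-orcid-work-id"; // hypothetical original id
		String oafId = String.format("50|%s::%s", orcidPrefix, md5(originalId.toLowerCase()));
		System.out.println(oafId); // 50|orcid_______::<32 hex characters>
	}
}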
settingRelevantDate(rootElement, publication, "publication_date", "issued", true); + + // Adding collectedfrom + publication.setCollectedfrom(Arrays.asList(createCollectedFrom())); + + // Adding type + final String type = getStringValue(rootElement, "type"); + String cobjValue = ""; + if (StringUtils.isNotBlank(type)) { + publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource")); + + final String typeValue = typologiesMapping.get(type).get("value"); + cobjValue = typologiesMapping.get(type).get("cobj"); + final Instance instance = new Instance(); + + // Adding hostedby + instance.setHostedby(createHostedBy()); + + // Adding url + final List urls = createRepeatedField(rootElement, "urls"); + if (urls!=null && !urls.isEmpty()) { + instance.setUrl(urls); + } + + final String pubDate = getPublicationDate(rootElement, "publication_date"); + if (StringUtils.isNotBlank(pubDate)) { + instance.setDateofacceptance(mapStringField(pubDate, null)); + } + + instance.setCollectedfrom(createCollectedFrom()); + + // Adding accessright + instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes")); + + // Adding type + instance.setInstancetype(mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource")); + + publication.setInstance(Arrays.asList(instance)); + } else { +// context.incrementCounter("filtered", "type_not_found", 1); + return null; + } + + // Adding authors + final List authors = createAuthors(rootElement); + if (authors != null && authors.size() > 0) { + publication.setAuthor(authors); + } else { +// context.incrementCounter("filtered", "author_not_found", 1); + return null; + } + String classValue = getDefaultResulttype(cobjValue); + publication.setResulttype(mapQualifier(classValue, classValue,"dnet:result_typologies", "dnet:result_typologies")); + return publication; + } + + public static List createAuthors(final JsonObject root) { + + final String authorsJSONFieldName = "authors"; + + if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) { + + final List authors = new ArrayList<>(); + final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName); + int firstCounter = 0; + int defaultCounter = 0; + int rank = 1; + int currentRank = 0; + + for (final JsonElement item : jsonAuthors) { + final JsonObject jsonAuthor = item.getAsJsonObject(); + final Author author = new Author(); + if (item.isJsonObject()) { + final String surname = getStringValue(jsonAuthor, "surname"); + final String name = getStringValue(jsonAuthor, "name"); + final String oid = getStringValue(jsonAuthor, "oid"); + final String seq = getStringValue(jsonAuthor, "seq"); + if (StringUtils.isNotBlank(seq)) { + if (seq.equals("first")) { + firstCounter += 1; + rank = firstCounter; + + } else if (seq.equals("additional")) { + rank = currentRank + 1; + } else { + defaultCounter += 1; + rank = defaultCounter; + } + } + + if (StringUtils.isNotBlank(oid)) { + author.setPid(Arrays.asList(mapAuthorId(oid))); + author.setFullname(name + " " + surname); + if (StringUtils.isNotBlank(name)) { + author.setName(name); + } + if (StringUtils.isNotBlank(surname)) { + author.setSurname(surname); + } + } else { + String fullname = ""; + if (StringUtils.isNotBlank(name)) { + fullname = name; + } else { + if (StringUtils.isNotBlank(surname)) { + fullname = surname; + } + } + PacePerson p = new PacePerson(fullname, false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); 
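In createAuthors above, the contributor sequence drives the author rank through three separate counters, which is easy to misread in the middle of the larger method. The snippet below is a condensed, standalone restatement of just that assignment; the sample sequence values are made up for illustration.

import java.util.Arrays;
import java.util.List;

public class AuthorRankSketch {

	public static void main(String[] args) {
		List<String> sequences = Arrays.asList("first", "additional", "additional", "", "first");
		int firstCounter = 0;
		int defaultCounter = 0;
		int rank = 1;
		int currentRank = 0;

		for (String seq : sequences) {
			if (seq != null && !seq.trim().isEmpty()) {
				if (seq.equals("first")) {
					firstCounter += 1; // every "first" restarts from its own counter
					rank = firstCounter;
				} else if (seq.equals("additional")) {
					rank = currentRank + 1; // "additional" follows the previous author
				} else {
					defaultCounter += 1; // unrecognized sequence: independent counter
					rank = defaultCounter;
				}
			} // blank sequence: the previous rank is kept
			System.out.println("'" + seq + "' -> rank " + rank);
			currentRank = rank;
		}
	}
}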
+ author.setSurname(p.getNormalisedSurname()); + author.setFullname(p.getNormalisedFullname()); + } + else { + author.setFullname(fullname); + } + } + } + author.setRank(rank); + authors.add(author); + currentRank = rank; + } + return authors; + + } + return null; + } + + private static List createRepeatedField(final JsonObject rootElement, final String fieldName) { + if (!rootElement.has(fieldName)) { return null; } + if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; } + if (rootElement.get(fieldName).isJsonArray()) { + if (!isValidJsonArray(rootElement, fieldName)) { return null; } + return getArrayValues(rootElement, fieldName); + } else { + String field = getStringValue(rootElement, fieldName); + return Arrays.asList(cleanField(field)); + } + } + + private static String cleanField(String value) { + if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') { + value = value.substring(1, value.length() - 1); + } + return value; + } + + private static void settingRelevantDate(final JsonObject rootElement, + final Publication publication, + final String jsonKey, + final String dictionaryKey, + final boolean addToDateOfAcceptance) { + + final String pubDate = getPublicationDate(rootElement, "publication_date"); + if (StringUtils.isNotBlank(pubDate)) { + if (addToDateOfAcceptance) { + publication.setDateofacceptance(mapStringField(pubDate, null)); + } + Qualifier q = mapQualifier(dictionaryKey,dictionaryKey,"dnet:dataCite_date","dnet:dataCite_date"); + publication.setRelevantdate( + Arrays.asList(pubDate) + .stream() + .map(r -> { + return mapStructuredProperty(r, q, null); + }) + .collect(Collectors.toList())); + } + } + + private static String getPublicationDate(final JsonObject rootElement, + final String jsonKey) { + + final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); + if (pubDateJson == null) { return null; } + final String year = getStringValue(pubDateJson, "year"); + final String month = getStringValue(pubDateJson, "month"); + final String day = getStringValue(pubDateJson, "day"); + + if (StringUtils.isBlank(year)) { return null; } + String pubDate = "".concat(year); + if (StringUtils.isNotBlank(month)) { + pubDate = pubDate.concat("-" + month); + if (StringUtils.isNotBlank(day)) { + pubDate = pubDate.concat("-" + day); + } else { + pubDate += "-01"; + } + } else { + pubDate += "-01-01"; + } + if (isValidDate(pubDate)) { return pubDate; } + return null; + } + + protected static boolean isValid(final JsonObject rootElement/*, final Reporter context*/) { + + final String type = getStringValue(rootElement, "type"); + if (!typologiesMapping.containsKey(type)) { +// context.incrementCounter("filtered", "unknowntype_" + type, 1); + return false; + } + + if (!isValidJsonArray(rootElement, "titles")) { +// context.incrementCounter("filtered", "invalid_title", 1); + return false; + } + return true; + } + + private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { + if (!rootElement.has(fieldName)) { return false; } + final JsonElement jsonElement = rootElement.get(fieldName); + if (jsonElement.isJsonNull()) { return false; } + if (jsonElement.isJsonArray()) { + final JsonArray jsonArray = jsonElement.getAsJsonArray(); + if (jsonArray.isJsonNull()) { return false; } + if (jsonArray.get(0).isJsonNull()) { return false; } + } + return true; + } + + private static Qualifier mapQualifier(String classId, String className, String schemeId, String 
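A note on getPublicationDate above: it assembles a yyyy-MM-dd string from the optional year, month and day fields, padding missing parts with "-01" and then validating against a strict two-digit pattern, so single-digit months or days are rejected. A self-contained sketch of that assembly (the helper name and sample values are illustrative):

public class PublicationDateSketch {

	static String buildDate(String year, String month, String day) {
		if (year == null || year.trim().isEmpty()) {
			return null;
		}
		String pubDate = year;
		if (month != null && !month.trim().isEmpty()) {
			pubDate += "-" + month;
			pubDate += (day != null && !day.trim().isEmpty()) ? "-" + day : "-01";
		} else {
			pubDate += "-01-01";
		}
		// same validation rule as isValidDate: exactly yyyy-MM-dd
		return pubDate.matches("\\d{4}-\\d{2}-\\d{2}") ? pubDate : null;
	}

	public static void main(String[] args) {
		System.out.println(buildDate("2019", "10", null)); // 2019-10-01
		System.out.println(buildDate("2019", null, null)); // 2019-01-01
		System.out.println(buildDate("2019", "7", null)); // null, month is not two digits
	}
}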
schemeName) { + final Qualifier qualifier = new Qualifier(); + qualifier.setClassid(classId); + qualifier.setClassname(className); + qualifier.setSchemeid(schemeId); + qualifier.setSchemename(schemeName); + return qualifier; + } + + private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, String schemeName) { + ExternalReference ex = new ExternalReference(); + ex.setRefidentifier(extId); + ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName )); + return ex; + } + + private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) { + if (value == null | StringUtils.isBlank(value)) { + return null; + } + + final StructuredProperty structuredProperty = new StructuredProperty(); + structuredProperty.setValue(value); + structuredProperty.setQualifier(qualifier); + structuredProperty.setDataInfo(dataInfo); + return structuredProperty; + } + + private static Field mapStringField(String value, DataInfo dataInfo) { + if (value == null || StringUtils.isBlank(value)) { + return null; + } + + final Field stringField = new Field<>(); + stringField.setValue(value); + stringField.setDataInfo(dataInfo); + return stringField; + } + + private static KeyValue createCollectedFrom() { + KeyValue cf = new KeyValue(); + cf.setValue(ORCID); + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); + return cf; + } + + private static KeyValue createHostedBy() { + KeyValue hb = new KeyValue(); + hb.setValue("Unknown Repository"); + hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c"); + return hb; + } + + private static StructuredProperty mapAuthorId(String orcidId) { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(orcidId); + final Qualifier q = new Qualifier(); + q.setClassid("ORCID"); + q.setClassname("ORCID"); + sp.setQualifier(q); + return sp; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java deleted file mode 100644 index 01b172359..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java +++ /dev/null @@ -1,427 +0,0 @@ - -package eu.dnetlib.doiboost.orcidnodoi.proto; - -public class ProtoWriter { - -} -// -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate; -// -//import java.io.IOException; -//import java.io.InputStream; -//import java.util.ArrayList; -//import java.util.HashMap; -//import java.util.List; -//import java.util.Map; -// -//import org.apache.commons.io.IOUtils; -//import org.apache.commons.lang3.StringUtils; -// -//import com.google.gson.Gson; -//import com.google.gson.JsonArray; -//import com.google.gson.JsonElement; -//import com.google.gson.JsonObject; -//import com.googlecode.protobuf.format.JsonFormat; -// -//import eu.dnetlib.actionmanager.actions.ActionFactory; -//import eu.dnetlib.actionmanager.actions.AtomicAction; -//import 
eu.dnetlib.actionmanager.common.Agent; -//import eu.dnetlib.data.mapreduce.hbase.Reporter; -//import eu.dnetlib.data.mapreduce.util.StreamUtils; -//import eu.dnetlib.data.proto.FieldTypeProtos; -//import eu.dnetlib.data.proto.FieldTypeProtos.Author; -//import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo; -//import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue; -//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; -//import eu.dnetlib.data.proto.FieldTypeProtos.StringField; -//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; -//import eu.dnetlib.data.proto.KindProtos; -//import eu.dnetlib.data.proto.OafProtos; -//import eu.dnetlib.data.proto.ResultProtos; -//import eu.dnetlib.data.proto.TypeProtos; -//import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions; -//import eu.dnetlib.miscutils.collections.Pair; -//import eu.dnetlib.miscutils.datetime.DateUtils; -//import eu.dnetlib.pace.model.Person; -// -//public class ProtoWriter { -// -// public static final String ORCID = "ORCID"; -// public final static String orcidPREFIX = "orcid_______"; -// public static final String OPENAIRE_PREFIX = "openaire____"; -// public static final String SEPARATOR = "::"; -// -// private static Map> datasources = new HashMap>() { -// -// { -// put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); -// -// } -// }; -// -// // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname -// private static Map> externalIds = new HashMap>() { -// -// { -// put("ark".toLowerCase(), new Pair<>("ark", "ark")); -// put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); -// put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); -// put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); -// put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); -// put("urn".toLowerCase(), new Pair<>("urn", "urn")); -// } -// }; -// -// static Map> typologiesMapping; -// -// static { -// try { -// final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json"); -// final String tt = IOUtils.toString(is); -// typologiesMapping = new Gson().fromJson(tt, Map.class); -// } catch (final IOException e) { -// e.printStackTrace(); -// } -// } -// -// public static final String PID_TYPES = "dnet:pid_types"; -// -// public static List generatePublicationActionsFromDump(final JsonObject rootElement, -// final ActionFactory factory, -// final String setName, -// final Agent agent, -// final Reporter context) { -// -// if (!isValid(rootElement, context)) { return null; } -// -// // Create OAF proto -// -// final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder(); -// -// oaf.setDataInfo( -// DataInfo.newBuilder() -// .setDeletedbyinference(false) -// .setInferred(false) -// .setTrust("0.9") -// .setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions")) -// .build()); -// -// // Adding kind -// oaf.setKind(KindProtos.Kind.entity); -// -// oaf.setLastupdatetimestamp(DateUtils.now()); -// -// // creating result proto -// final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result); -// -// entity.setDateofcollection("2018-10-22"); -// entity.setDateoftransformation(DateUtils.now_ISO8601()); -// -// // Adding external ids -// StreamUtils.toStream(externalIds.keySet().iterator()) -// .forEach(jsonExtId -> { -// final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); -// 
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); -// final String extId = getStringValue(rootElement, jsonExtId); -// if (StringUtils.isNotBlank(extId)) { -// entity.addPid(StructuredProperty.newBuilder() -// .setValue(extId) -// .setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types") -// .setSchemename("dnet:pid_types").build()) -// .build()); -// } -// }); -// -// // Create result field -// final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder(); -// -// // Create metadata proto -// final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder(); -// -// // Adding source -// final String source = getStringValue(rootElement, "source"); -// if (StringUtils.isNotBlank(source)) { -// metadata.addSource(StringField.newBuilder().setValue(source).build()); -// } -// -// // Adding title -// final String title = createRepeatedField(rootElement, "titles"); -// if (StringUtils.isBlank(title)) { -// context.incrementCounter("filtered", "title_not_found", 1); -// return null; -// } -// metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder() -// .setValue(title) -// .setQualifier(getQualifier("main title", "dnet:dataCite_title")) -// .build()); -// -// // Adding identifier -// final String id = getStringValue(rootElement, "id"); -// String sourceId = null; -// if (id != null) { -// entity.addOriginalId(id); -// sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id)); -// } else { -// sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title)); -// } -// entity.setId(sourceId); -// -// // Adding relevant date -// settingRelevantDate(rootElement, metadata, "publication_date", "issued", true); -// -// // Adding collectedfrom -// final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder() -// .setValue(ORCID) -// .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a") -// .build(); -// entity.addCollectedfrom(collectedFrom); -// -// // Adding type -// final String type = getStringValue(rootElement, "type"); -// String cobjValue = ""; -// if (StringUtils.isNotBlank(type)) { -// -// metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder() -// .setClassid(type) -// .setClassname(type) -// .setSchemeid("dnet:dataCite_resource") -// .setSchemename("dnet:dataCite_resource") -// .build()); -// -// final String typeValue = typologiesMapping.get(type).get("value"); -// cobjValue = typologiesMapping.get(type).get("cobj"); -// final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder(); -// -// // Adding hostedby -// instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder() -// .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c") -// .setValue("Unknown Repository") -// .build()); -// -// // Adding url -// final String url = createRepeatedField(rootElement, "urls"); -// if (StringUtils.isNotBlank(url)) { -// instance.addUrl(url); -// } -// -// final String pubDate = getPublicationDate(rootElement, "publication_date"); -// if (StringUtils.isNotBlank(pubDate)) { -// instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build()); -// } -// -// instance.setCollectedfrom(collectedFrom); -// -// // Adding accessright -// instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder() -// .setClassid("UNKNOWN") -// .setClassname("UNKNOWN") 
-// .setSchemeid("dnet:access_modes") -// .setSchemename("dnet:access_modes") -// .build()); -// -// // Adding type -// instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder() -// .setClassid(cobjValue) -// .setClassname(typeValue) -// .setSchemeid("dnet:publication_resource") -// .setSchemename("dnet:publication_resource") -// .build()); -// -// result.addInstance(instance); -// } else { -// context.incrementCounter("filtered", "type_not_found", 1); -// return null; -// } -// -// // Adding authors -// final List authors = createAuthors(rootElement); -// if (authors != null && authors.size() > 0) { -// metadata.addAllAuthor(authors); -// } else { -// context.incrementCounter("filtered", "author_not_found", 1); -// return null; -// } -// -// metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies")); -// result.setMetadata(metadata.build()); -// entity.setResult(result.build()); -// oaf.setEntity(entity.build()); -// -// final List actionList = new ArrayList<>(); -// -// actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray())); -// -//// System.out.println(JsonFormat.printToString(oaf.build())); -// return actionList; -// -// } -// -// public static List createAuthors(final JsonObject root) { -// -// final String authorsJSONFieldName = "authors"; -// -// if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) { -// -// final List authors = new ArrayList<>(); -// final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName); -// int firstCounter = 0; -// int defaultCounter = 0; -// int rank = 1; -// int currentRank = 0; -// -// for (final JsonElement item : jsonAuthors) { -// final JsonObject author = item.getAsJsonObject(); -// final Author.Builder result = Author.newBuilder(); -// if (item.isJsonObject()) { -// final String surname = getStringValue(author, "surname"); -// final String name = getStringValue(author, "name"); -// final String oid = getStringValue(author, "oid"); -// final String seq = getStringValue(author, "seq"); -// if (StringUtils.isNotBlank(seq)) { -// if (seq.equals("first")) { -// firstCounter += 1; -// rank = firstCounter; -// -// } else if (seq.equals("additional")) { -// rank = currentRank + 1; -// } else { -// defaultCounter += 1; -// rank = defaultCounter; -// } -// } -// -// if (StringUtils.isNotBlank(oid)) { -// result.addPid(KeyValue.newBuilder() -// .setValue(oid) -// .setKey("ORCID") -// .build()); -// result.setFullname(name + " " + surname); -// if (StringUtils.isNotBlank(name)) { -// result.setName(name); -// } -// if (StringUtils.isNotBlank(surname)) { -// result.setSurname(surname); -// } -// } else { -// String fullname = ""; -// if (StringUtils.isNotBlank(name)) { -// fullname = name; -// } else { -// if (StringUtils.isNotBlank(surname)) { -// fullname = surname; -// } -// } -// Person p = new Person(fullname, false); -// if (p.isAccurate()) { -// result.setName(p.getNormalisedFirstName()); -// result.setSurname(p.getNormalisedSurname()); -// result.setFullname(p.getNormalisedFullname()); -// } -// else { -// result.setFullname(fullname); -// } -// } -// } -// result.setRank(rank); -// authors.add(result.build()); -// currentRank = rank; -// } -// return authors; -// -// } -// return null; -// } -// -// private static String createRepeatedField(final JsonObject rootElement, final String fieldName) { -// String field = ""; -// if (!rootElement.has(fieldName)) { return null; } -// if 
(rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; } -// if (rootElement.get(fieldName).isJsonArray()) { -// if (!isValidJsonArray(rootElement, fieldName)) { return null; } -// final StringBuilder ttl = new StringBuilder(); -// getArrayValues(rootElement, fieldName).forEach(ttl::append); -// field = ttl.toString(); -// } else { -// field = getStringValue(rootElement, fieldName); -// } -// -// if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') { -// field = field.substring(1, field.length() - 1); -// } -// return field; -// } -// -// private static void settingRelevantDate(final JsonObject rootElement, -// final ResultProtos.Result.Metadata.Builder metadata, -// final String jsonKey, -// final String dictionaryKey, -// final boolean addToDateOfAcceptance) { -// -// final String pubDate = getPublicationDate(rootElement, "publication_date"); -// if (StringUtils.isNotBlank(pubDate)) { -// if (addToDateOfAcceptance) { -// metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build()); -// } -// metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder() -// .setValue(pubDate) -// .setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date")) -// .build()); -// } -// } -// -// private static String getPublicationDate(final JsonObject rootElement, -// final String jsonKey) { -// -// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); -// if (pubDateJson == null) { return null; } -// final String year = getStringValue(pubDateJson, "year"); -// final String month = getStringValue(pubDateJson, "month"); -// final String day = getStringValue(pubDateJson, "day"); -// -// if (StringUtils.isBlank(year)) { return null; } -// String pubDate = "".concat(year); -// if (StringUtils.isNotBlank(month)) { -// pubDate = pubDate.concat("-" + month); -// if (StringUtils.isNotBlank(day)) { -// pubDate = pubDate.concat("-" + day); -// } else { -// pubDate += "-01"; -// } -// } else { -// pubDate += "-01-01"; -// } -// if (isValidDate(pubDate)) { return pubDate; } -// return null; -// } -// -// protected static boolean isValid(final JsonObject rootElement, final Reporter context) { -// -// final String type = getStringValue(rootElement, "type"); -// if (!typologiesMapping.containsKey(type)) { -// context.incrementCounter("filtered", "unknowntype_" + type, 1); -// return false; -// } -// -// if (!isValidJsonArray(rootElement, "titles")) { -// context.incrementCounter("filtered", "invalid_title", 1); -// return false; -// } -// return true; -// } -// -// private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { -// if (!rootElement.has(fieldName)) { return false; } -// final JsonElement jsonElement = rootElement.get(fieldName); -// if (jsonElement.isJsonNull()) { return false; } -// if (jsonElement.isJsonArray()) { -// final JsonArray jsonArray = jsonElement.getAsJsonArray(); -// if (jsonArray.isJsonNull()) { return false; } -// if (jsonArray.get(0).isJsonNull()) { return false; } -// } -// return true; -// } -//} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java new file mode 100644 index 000000000..c460f6299 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java @@ -0,0 +1,107 @@ +package 
eu.dnetlib.doiboost.orcidnodoi.util; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import org.apache.commons.lang3.StringUtils; + +import java.text.SimpleDateFormat; +import java.util.*; + +public class DumpToActionsUtility { + + private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); + + public static String getStringValue(final JsonObject root, final String key) { + if (root.has(key) && !root.get(key).isJsonNull()) + return root.get(key).getAsString(); + return null; + } + + public static List getArrayValues(final JsonObject root, final String key) { + if (root.has(key) && root.get(key).isJsonArray()) { + final JsonArray asJsonArray = root.get(key).getAsJsonArray(); + final List result = new ArrayList<>(); + + + asJsonArray.forEach(it -> { + if (StringUtils.isNotBlank(it.getAsString())) { + result.add(it.getAsString()); + } + }); + return result; + } + return new ArrayList<>(); + } + public static List getArrayObjects(final JsonObject root, final String key) { + if (root.has(key) && root.get(key).isJsonArray()) { + final JsonArray asJsonArray = root.get(key).getAsJsonArray(); + final List result = new ArrayList<>(); + asJsonArray.forEach(it -> { + if (it.getAsJsonObject() != null) { + result.add(it.getAsJsonObject()); + } + }); + return result; + } + return new ArrayList<>(); + } + + public static boolean isValidDate(final String date) { + return date.matches("\\d{4}-\\d{2}-\\d{2}"); + } + + public static String now_ISO8601() { // NOPMD + String result; + synchronized (ISO8601FORMAT) { + result = ISO8601FORMAT.format(new Date()); + } + //convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00 + //- note the added colon for the Timezone + return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2); + } + + public static String getDefaultResulttype(final String cobjcategory) { + switch (cobjcategory) { + case "0029": + return "software"; + case "0021": + case "0024": + case "0025": + case "0030": + return "dataset"; + case "0000": + case "0010": + case "0018": + case "0020": + case "0022": + case "0023": + case "0026": + case "0027": + case "0028": + case "0037": + return "other"; + case "0001": + case "0002": + case "0004": + case "0005": + case "0006": + case "0007": + case "0008": + case "0009": + case "0011": + case "0012": + case "0013": + case "0014": + case "0015": + case "0016": + case "0017": + case "0019": + case "0031": + case "0032": + return "publication"; + default: + return "publication"; + } + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java new file mode 100644 index 000000000..58c09af60 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java @@ -0,0 +1,30 @@ +package eu.dnetlib.doiboost.orcidnodoi.util; + +public class Pair { + + private K k; + + private V v; + + public Pair(K k, V v) { + this.k = k; + this.v = v; + } + + public K getKey() { + return k; + } + + public V getValue() { + return v; + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof Pair) { + Pair tmp = (Pair) obj; + return k.equals(tmp.getKey()) && v.equals(tmp.getValue()); + } else return false; + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json 
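On the new Pair class above: it overrides equals but not hashCode, so two equal pairs would not be treated as the same key in a HashMap or HashSet. That is harmless while Pair is only used as a map value, as it is here, but if it ever becomes a key a consistent hashCode is needed. A minimal sketch (not part of the patch) of a pair whose equals and hashCode agree:

import java.util.Objects;

public class PairSketch<K, V> {

	private final K k;
	private final V v;

	public PairSketch(K k, V v) {
		this.k = k;
		this.v = v;
	}

	public K getKey() {
		return k;
	}

	public V getValue() {
		return v;
	}

	@Override
	public boolean equals(Object obj) {
		if (!(obj instanceof PairSketch)) {
			return false;
		}
		PairSketch<?, ?> other = (PairSketch<?, ?>) obj;
		return Objects.equals(k, other.k) && Objects.equals(v, other.v);
	}

	@Override
	public int hashCode() {
		return Objects.hash(k, v);
	}
}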
b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json new file mode 100644 index 000000000..cb696f279 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json @@ -0,0 +1,41 @@ +{ + "reference-entry": {"cobj":"0013", "value": "Part of book or chapter of book"}, + "report": {"cobj":"0017", "value": "Report"}, + "dataset": {"cobj":"0021", "value": "Dataset"}, + "journal-article": {"cobj":"0001", "value": "Article"}, + "reference-book": {"cobj":"0002", "value": "Book"}, + "other": {"cobj":"0020", "value": "Other ORP type"}, + "proceedings-article": {"cobj":"0004", "value": "Conference object"}, + "standard": {"cobj":"0038", "value": "Other literature type"}, + "book-part": {"cobj":"0002", "value": "Book"}, + "monograph": {"cobj":"0002", "value": "Book"}, + "report-series": {"cobj":"0017", "value": "Report"}, + "book": {"cobj":"0002", "value": "Book"}, + "book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"}, + "peer-review": {"cobj":"0015", "value": "Review"}, + "book-section": {"cobj":"0013", "value": "Part of book or chapter of book"}, + "book-review": {"cobj":"0015", "value": "Review"}, + "conference-abstract": {"cobj":"0004", "value": "Conference object"}, + "conference-paper": {"cobj":"0004", "value": "Conference object"}, + "conference-poster": {"cobj":"0004", "value": "Conference object"}, + "data-set": {"cobj":"0021", "value": "Dataset"}, + "dictionary-entry": {"cobj":"0038", "value": "Other literature type"}, + "disclosure": {"cobj":"0038", "value": "Other literature type"}, + "dissertation": {"cobj":"0006", "value": "Doctoral thesis"}, + "edited-book": {"cobj":"0002", "value": "Book"}, + "encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"}, + "lecture-speech": {"cobj":"0010", "value": "Lecture"}, + "license": {"cobj":"0038", "value": "Other literature type"}, + "magazine-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"}, + "manual": {"cobj":"0038", "value": "Other literature type"}, + "newsletter-article": {"cobj":"0012", "value": "Newsletter"}, + "newspaper-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"}, + "patent": {"cobj":"0019", "value": "Patent"}, + "research-technique": {"cobj":"0020", "value": "Other ORP type"}, + "research-tool": {"cobj":"0020", "value": "Other ORP type"}, + "standards-and-policy": {"cobj":"0038", "value": "Other literature type"}, + "supervised-student-publication": {"cobj":"0001", "value": "Article"}, + "technical-standard": {"cobj":"0038", "value": "Other literature type"}, + "website": {"cobj":"0020", "value": "Other ORP type"}, + "working-paper": {"cobj":"0014", "value": "Research"} +} \ No newline at end of file From 1729cc5cf320c32cdafa2523d884d965ccefdc98 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 2 Jul 2020 18:46:20 +0200 Subject: [PATCH 06/34] publication conversion from json to oaf test --- .../orcidnodoi/oaf/OrcidWorkToOAF.java | 420 ---------------- .../orcidnodoi/oaf/PublicationToOaf.java | 456 ++++++++++++++++++ .../orcidnodoi/util/DumpToActionsUtility.java | 184 +++---- .../doiboost/orcidnodoi/util/Pair.java | 40 +- .../doiboost/orcid/OrcidClientTest.java | 2 +- .../orcidnodoi/PublicationToOafTest.java | 76 +++ .../orcidnodoi/xml/OrcidNoDoiTest.java | 3 +- .../doiboost/orcidnodoi/publication.json | 1 + 8 files changed, 650 insertions(+), 532 deletions(-) delete mode 100644 
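On the typologies.json mapping above: it is loaded with Gson into a nested map, each ORCID work type resolves to a cobj code and a human-readable value that feed the instance type, and the cobj code is then turned into the result typology by getDefaultResulttype. A small usage sketch with two inlined entries (the JSON literal is trimmed and the final ternary is a crude stand-in for getDefaultResulttype):

import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class TypologiesSketch {

	public static void main(String[] args) {
		String json = "{"
			+ "\"journal-article\": {\"cobj\":\"0001\", \"value\": \"Article\"},"
			+ "\"dataset\": {\"cobj\":\"0021\", \"value\": \"Dataset\"}"
			+ "}";

		Map<String, Map<String, String>> typologies = new Gson()
			.fromJson(json, new TypeToken<Map<String, Map<String, String>>>() {
			}.getType());

		String type = "journal-article"; // ORCID work type
		String cobj = typologies.get(type).get("cobj"); // 0001
		String value = typologies.get(type).get("value"); // Article
		String resulttype = "0021".equals(cobj) ? "dataset" : "publication"; // stand-in for getDefaultResulttype
		System.out.println(type + " -> cobj=" + cobj + ", instancetype=" + value + ", resulttype=" + resulttype);
	}
}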
dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java deleted file mode 100644 index 673abb407..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java +++ /dev/null @@ -1,420 +0,0 @@ - -package eu.dnetlib.doiboost.orcidnodoi.oaf; - -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import eu.dnetlib.dhp.common.PacePerson; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks; -import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; -import eu.dnetlib.doiboost.orcidnodoi.util.Pair; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.util.stream.Collectors; - -import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*; - -public class OrcidWorkToOAF { - - static Logger logger = LoggerFactory.getLogger(OrcidWorkToOAF.class); - - public static final String ORCID = "ORCID"; - public final static String orcidPREFIX = "orcid_______"; - public static final String OPENAIRE_PREFIX = "openaire____"; - public static final String SEPARATOR = "::"; - - private static Map> datasources = new HashMap>() { - - { - put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); - - } - }; - - // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname - private static Map> externalIds = new HashMap>() { - - { - put("ark".toLowerCase(), new Pair<>("ark", "ark")); - put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); - put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); - put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); - put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); - put("urn".toLowerCase(), new Pair<>("urn", "urn")); - } - }; - - static Map> typologiesMapping; - - static { - try { - final String tt = IOUtils.toString(OrcidWorkToOAF.class.getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json")); - typologiesMapping = new Gson().fromJson(tt, Map.class); - } catch (final Exception e) { - logger.error("loading typologies", e); - } - } - - public static final String PID_TYPES = "dnet:pid_types"; - - public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement, final String setName) { - - if (!isValid(rootElement/*, context*/)) { return null; } - - Publication publication = new Publication(); - - final DataInfo dataInfo = new DataInfo(); - dataInfo.setDeletedbyinference(false); - dataInfo.setInferred(false); - dataInfo.setTrust("0.9"); - dataInfo.setProvenanceaction( - mapQualifier( - "sysimport:actionset:orcidworks-no-doi", - "sysimport:actionset:orcidworks-no-doi", - "dnet:provenanceActions", - 
"dnet:provenanceActions")); - publication.setDataInfo(dataInfo); - - publication.setLastupdatetimestamp(new Date().getTime()); - - publication.setDateofcollection("2019-10-22"); - publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601()); - - // Adding external ids - externalIds.keySet().stream() - .forEach(jsonExtId -> { - final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); - final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); - final String extId = getStringValue(rootElement, jsonExtId); - if (StringUtils.isNotBlank(extId)) { - publication.getExternalReference().add( - convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types")); - } - }); - - // Adding source -// final String source = getStringValue(rootElement, "source"); -// if (StringUtils.isNotBlank(source)) { -// metadata.addSource(StringField.newBuilder().setValue(source).build()); -// } - - // Adding titles - final List titles = createRepeatedField(rootElement, "titles"); - if (titles==null || titles.isEmpty()) { -// context.incrementCounter("filtered", "title_not_found", 1); - return null; - } - Qualifier q = mapQualifier("main title","main title","dnet:dataCite_title","dnet:dataCite_title"); - publication.setTitle( - titles - .stream() - .map(t -> { - return mapStructuredProperty(t, q, null); - }) - .collect(Collectors.toList())); - // Adding identifier - final String id = getStringValue(rootElement, "id"); - String sourceId = null; - if (id != null) { - publication.setOriginalId(Arrays.asList(id)); - sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase())); - } else { - String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(",")); - sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase())); - } - publication.setId(sourceId); - - // Adding relevant date - settingRelevantDate(rootElement, publication, "publication_date", "issued", true); - - // Adding collectedfrom - publication.setCollectedfrom(Arrays.asList(createCollectedFrom())); - - // Adding type - final String type = getStringValue(rootElement, "type"); - String cobjValue = ""; - if (StringUtils.isNotBlank(type)) { - publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource")); - - final String typeValue = typologiesMapping.get(type).get("value"); - cobjValue = typologiesMapping.get(type).get("cobj"); - final Instance instance = new Instance(); - - // Adding hostedby - instance.setHostedby(createHostedBy()); - - // Adding url - final List urls = createRepeatedField(rootElement, "urls"); - if (urls!=null && !urls.isEmpty()) { - instance.setUrl(urls); - } - - final String pubDate = getPublicationDate(rootElement, "publication_date"); - if (StringUtils.isNotBlank(pubDate)) { - instance.setDateofacceptance(mapStringField(pubDate, null)); - } - - instance.setCollectedfrom(createCollectedFrom()); - - // Adding accessright - instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes")); - - // Adding type - instance.setInstancetype(mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource")); - - publication.setInstance(Arrays.asList(instance)); - } else { -// context.incrementCounter("filtered", "type_not_found", 1); - return null; - } - - // Adding authors - final List authors = createAuthors(rootElement); - if (authors != null && authors.size() > 0) { - 
publication.setAuthor(authors); - } else { -// context.incrementCounter("filtered", "author_not_found", 1); - return null; - } - String classValue = getDefaultResulttype(cobjValue); - publication.setResulttype(mapQualifier(classValue, classValue,"dnet:result_typologies", "dnet:result_typologies")); - return publication; - } - - public static List createAuthors(final JsonObject root) { - - final String authorsJSONFieldName = "authors"; - - if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) { - - final List authors = new ArrayList<>(); - final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName); - int firstCounter = 0; - int defaultCounter = 0; - int rank = 1; - int currentRank = 0; - - for (final JsonElement item : jsonAuthors) { - final JsonObject jsonAuthor = item.getAsJsonObject(); - final Author author = new Author(); - if (item.isJsonObject()) { - final String surname = getStringValue(jsonAuthor, "surname"); - final String name = getStringValue(jsonAuthor, "name"); - final String oid = getStringValue(jsonAuthor, "oid"); - final String seq = getStringValue(jsonAuthor, "seq"); - if (StringUtils.isNotBlank(seq)) { - if (seq.equals("first")) { - firstCounter += 1; - rank = firstCounter; - - } else if (seq.equals("additional")) { - rank = currentRank + 1; - } else { - defaultCounter += 1; - rank = defaultCounter; - } - } - - if (StringUtils.isNotBlank(oid)) { - author.setPid(Arrays.asList(mapAuthorId(oid))); - author.setFullname(name + " " + surname); - if (StringUtils.isNotBlank(name)) { - author.setName(name); - } - if (StringUtils.isNotBlank(surname)) { - author.setSurname(surname); - } - } else { - String fullname = ""; - if (StringUtils.isNotBlank(name)) { - fullname = name; - } else { - if (StringUtils.isNotBlank(surname)) { - fullname = surname; - } - } - PacePerson p = new PacePerson(fullname, false); - if (p.isAccurate()) { - author.setName(p.getNormalisedFirstName()); - author.setSurname(p.getNormalisedSurname()); - author.setFullname(p.getNormalisedFullname()); - } - else { - author.setFullname(fullname); - } - } - } - author.setRank(rank); - authors.add(author); - currentRank = rank; - } - return authors; - - } - return null; - } - - private static List createRepeatedField(final JsonObject rootElement, final String fieldName) { - if (!rootElement.has(fieldName)) { return null; } - if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; } - if (rootElement.get(fieldName).isJsonArray()) { - if (!isValidJsonArray(rootElement, fieldName)) { return null; } - return getArrayValues(rootElement, fieldName); - } else { - String field = getStringValue(rootElement, fieldName); - return Arrays.asList(cleanField(field)); - } - } - - private static String cleanField(String value) { - if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') { - value = value.substring(1, value.length() - 1); - } - return value; - } - - private static void settingRelevantDate(final JsonObject rootElement, - final Publication publication, - final String jsonKey, - final String dictionaryKey, - final boolean addToDateOfAcceptance) { - - final String pubDate = getPublicationDate(rootElement, "publication_date"); - if (StringUtils.isNotBlank(pubDate)) { - if (addToDateOfAcceptance) { - publication.setDateofacceptance(mapStringField(pubDate, null)); - } - Qualifier q = mapQualifier(dictionaryKey,dictionaryKey,"dnet:dataCite_date","dnet:dataCite_date"); - publication.setRelevantdate( - 
Arrays.asList(pubDate) - .stream() - .map(r -> { - return mapStructuredProperty(r, q, null); - }) - .collect(Collectors.toList())); - } - } - - private static String getPublicationDate(final JsonObject rootElement, - final String jsonKey) { - - final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); - if (pubDateJson == null) { return null; } - final String year = getStringValue(pubDateJson, "year"); - final String month = getStringValue(pubDateJson, "month"); - final String day = getStringValue(pubDateJson, "day"); - - if (StringUtils.isBlank(year)) { return null; } - String pubDate = "".concat(year); - if (StringUtils.isNotBlank(month)) { - pubDate = pubDate.concat("-" + month); - if (StringUtils.isNotBlank(day)) { - pubDate = pubDate.concat("-" + day); - } else { - pubDate += "-01"; - } - } else { - pubDate += "-01-01"; - } - if (isValidDate(pubDate)) { return pubDate; } - return null; - } - - protected static boolean isValid(final JsonObject rootElement/*, final Reporter context*/) { - - final String type = getStringValue(rootElement, "type"); - if (!typologiesMapping.containsKey(type)) { -// context.incrementCounter("filtered", "unknowntype_" + type, 1); - return false; - } - - if (!isValidJsonArray(rootElement, "titles")) { -// context.incrementCounter("filtered", "invalid_title", 1); - return false; - } - return true; - } - - private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { - if (!rootElement.has(fieldName)) { return false; } - final JsonElement jsonElement = rootElement.get(fieldName); - if (jsonElement.isJsonNull()) { return false; } - if (jsonElement.isJsonArray()) { - final JsonArray jsonArray = jsonElement.getAsJsonArray(); - if (jsonArray.isJsonNull()) { return false; } - if (jsonArray.get(0).isJsonNull()) { return false; } - } - return true; - } - - private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) { - final Qualifier qualifier = new Qualifier(); - qualifier.setClassid(classId); - qualifier.setClassname(className); - qualifier.setSchemeid(schemeId); - qualifier.setSchemename(schemeName); - return qualifier; - } - - private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, String schemeName) { - ExternalReference ex = new ExternalReference(); - ex.setRefidentifier(extId); - ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName )); - return ex; - } - - private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) { - if (value == null | StringUtils.isBlank(value)) { - return null; - } - - final StructuredProperty structuredProperty = new StructuredProperty(); - structuredProperty.setValue(value); - structuredProperty.setQualifier(qualifier); - structuredProperty.setDataInfo(dataInfo); - return structuredProperty; - } - - private static Field mapStringField(String value, DataInfo dataInfo) { - if (value == null || StringUtils.isBlank(value)) { - return null; - } - - final Field stringField = new Field<>(); - stringField.setValue(value); - stringField.setDataInfo(dataInfo); - return stringField; - } - - private static KeyValue createCollectedFrom() { - KeyValue cf = new KeyValue(); - cf.setValue(ORCID); - cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); - return cf; - } - - private static KeyValue createHostedBy() { - KeyValue hb = new KeyValue(); - hb.setValue("Unknown Repository"); - hb.setKey("10|" + 
OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c"); - return hb; - } - - private static StructuredProperty mapAuthorId(String orcidId) { - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(orcidId); - final Qualifier q = new Qualifier(); - q.setClassid("ORCID"); - q.setClassname("ORCID"); - sp.setQualifier(q); - return sp; - } -} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java new file mode 100644 index 000000000..dc03767ec --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -0,0 +1,456 @@ + +package eu.dnetlib.doiboost.orcidnodoi.oaf; + +import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*; + +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +import eu.dnetlib.dhp.common.PacePerson; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; +import eu.dnetlib.doiboost.orcidnodoi.util.Pair; + +public class PublicationToOaf { + + static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); + + public static final String ORCID = "ORCID"; + public final static String orcidPREFIX = "orcid_______"; + public static final String OPENAIRE_PREFIX = "openaire____"; + public static final String SEPARATOR = "::"; + + private static Map> datasources = new HashMap>() { + + { + put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); + + } + }; + + // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname + private static Map> externalIds = new HashMap>() { + + { + put("ark".toLowerCase(), new Pair<>("ark", "ark")); + put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); + put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); + put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); + put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); + put("urn".toLowerCase(), new Pair<>("urn", "urn")); + } + }; + + static Map> typologiesMapping; + + static { + try { + final String tt = IOUtils + .toString( + PublicationToOaf.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json")); + typologiesMapping = new Gson().fromJson(tt, Map.class); + } catch (final Exception e) { + logger.error("loading typologies", e); + } + } + + public static final String PID_TYPES = "dnet:pid_types"; + + public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement) { + + logger.debug("generatePublicationActionsFromDump ..."); + if (!isValid(rootElement/* , context */)) { + logger.error("publication not valid"); + return null; + } + + Publication publication = new Publication(); + + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(false); + dataInfo.setInferred(false); + dataInfo.setTrust("0.9"); + dataInfo + .setProvenanceaction( + mapQualifier( + "sysimport:actionset:orcidworks-no-doi", + "sysimport:actionset:orcidworks-no-doi", + "dnet:provenanceActions", + "dnet:provenanceActions")); + 
publication.setDataInfo(dataInfo); + + publication.setLastupdatetimestamp(new Date().getTime()); + + publication.setDateofcollection("2019-10-22"); + publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601()); + + // Adding external ids + externalIds + .keySet() + .stream() + .forEach(jsonExtId -> { + final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); + final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); + final String extId = getStringValue(rootElement, jsonExtId); + if (StringUtils.isNotBlank(extId)) { + publication + .getExternalReference() + .add( + convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types")); + } + }); + + // Adding source + final String source = getStringValue(rootElement, "sourceName"); + if (StringUtils.isNotBlank(source)) { + publication.setSource(Arrays.asList(mapStringField(source, null))); + } + + // Adding titles + final List titles = createRepeatedField(rootElement, "titles"); + if (titles == null || titles.isEmpty()) { + logger.error("titles not found"); +// context.incrementCounter("filtered", "title_not_found", 1); + return null; + } + Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + publication + .setTitle( + titles + .stream() + .map(t -> { + return mapStructuredProperty(t, q, null); + }) + .collect(Collectors.toList())); + // Adding identifier + final String id = getStringValue(rootElement, "id"); + String sourceId = null; + if (id != null) { + publication.setOriginalId(Arrays.asList(id)); + sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase())); + } else { + String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(",")); + sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase())); + } + publication.setId(sourceId); + + // Adding relevant date + settingRelevantDate(rootElement, publication, "publication_date", "issued", true); + + // Adding collectedfrom + publication.setCollectedfrom(Arrays.asList(createCollectedFrom())); + + // Adding type + final String type = getStringValue(rootElement, "type"); + String cobjValue = ""; + if (StringUtils.isNotBlank(type)) { + publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource")); + + final String typeValue = typologiesMapping.get(type).get("value"); + cobjValue = typologiesMapping.get(type).get("cobj"); + final Instance instance = new Instance(); + + // Adding hostedby + instance.setHostedby(createHostedBy()); + + // Adding url + final List urls = createRepeatedField(rootElement, "urls"); + if (urls != null && !urls.isEmpty()) { + instance.setUrl(urls); + } + + final String pubDate = getPublicationDate(rootElement, "publication_date"); + if (StringUtils.isNotBlank(pubDate)) { + instance.setDateofacceptance(mapStringField(pubDate, null)); + } + + instance.setCollectedfrom(createCollectedFrom()); + + // Adding accessright + instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes")); + + // Adding type + instance + .setInstancetype( + mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource")); + + publication.setInstance(Arrays.asList(instance)); + } else { + logger.error("type not found"); +// context.incrementCounter("filtered", "type_not_found", 1); + return null; + } + + // Adding authors + final List authors = 
createAuthors(rootElement); + if (authors != null && authors.size() > 0) { + publication.setAuthor(authors); + } else { + logger.error("authors not found"); +// context.incrementCounter("filtered", "author_not_found", 1); + return null; + } + String classValue = getDefaultResulttype(cobjValue); + publication + .setResulttype(mapQualifier(classValue, classValue, "dnet:result_typologies", "dnet:result_typologies")); + return publication; + } + + public static List createAuthors(final JsonObject root) { + + final String authorsJSONFieldName = "contributors"; + + if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) { + + final List authors = new ArrayList<>(); + final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName); + int firstCounter = 0; + int defaultCounter = 0; + int rank = 1; + int currentRank = 0; + + for (final JsonElement item : jsonAuthors) { + final JsonObject jsonAuthor = item.getAsJsonObject(); + final Author author = new Author(); + if (item.isJsonObject()) { + final String creditname = getStringValue(jsonAuthor, "creditName"); + final String surname = getStringValue(jsonAuthor, "surname"); + final String name = getStringValue(jsonAuthor, "name"); + final String oid = getStringValue(jsonAuthor, "oid"); + final String seq = getStringValue(jsonAuthor, "sequence"); + if (StringUtils.isNotBlank(seq)) { + if (seq.equals("first")) { + firstCounter += 1; + rank = firstCounter; + + } else if (seq.equals("additional")) { + rank = currentRank + 1; + } else { + defaultCounter += 1; + rank = defaultCounter; + } + } + if (StringUtils.isNotBlank(oid)) { + author.setPid(Arrays.asList(mapAuthorId(oid))); + author.setFullname(name + " " + surname); + if (StringUtils.isNotBlank(name)) { + author.setName(name); + } + if (StringUtils.isNotBlank(surname)) { + author.setSurname(surname); + } + } else { + PacePerson p = new PacePerson(creditname, false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + author.setFullname(p.getNormalisedFullname()); + } else { + author.setFullname(creditname); + } + } + } + author.setRank(rank); + authors.add(author); + currentRank = rank; + } + return authors; + + } + return null; + } + + private static List createRepeatedField(final JsonObject rootElement, final String fieldName) { + if (!rootElement.has(fieldName)) { + return null; + } + if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { + return null; + } + if (rootElement.get(fieldName).isJsonArray()) { + if (!isValidJsonArray(rootElement, fieldName)) { + return null; + } + return getArrayValues(rootElement, fieldName); + } else { + String field = getStringValue(rootElement, fieldName); + return Arrays.asList(cleanField(field)); + } + } + + private static String cleanField(String value) { + if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') { + value = value.substring(1, value.length() - 1); + } + return value; + } + + private static void settingRelevantDate(final JsonObject rootElement, + final Publication publication, + final String jsonKey, + final String dictionaryKey, + final boolean addToDateOfAcceptance) { + + final String pubDate = getPublicationDate(rootElement, "publication_date"); + if (StringUtils.isNotBlank(pubDate)) { + if (addToDateOfAcceptance) { + publication.setDateofacceptance(mapStringField(pubDate, null)); + } + Qualifier q = mapQualifier(dictionaryKey, dictionaryKey, "dnet:dataCite_date", 
"dnet:dataCite_date"); + publication + .setRelevantdate( + Arrays + .asList(pubDate) + .stream() + .map(r -> { + return mapStructuredProperty(r, q, null); + }) + .collect(Collectors.toList())); + } + } + + private static String getPublicationDate(final JsonObject rootElement, + final String jsonKey) { + + final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); + if (pubDateJson == null) { + return null; + } + final String year = getStringValue(pubDateJson, "year"); + final String month = getStringValue(pubDateJson, "month"); + final String day = getStringValue(pubDateJson, "day"); + + if (StringUtils.isBlank(year)) { + return null; + } + String pubDate = "".concat(year); + if (StringUtils.isNotBlank(month)) { + pubDate = pubDate.concat("-" + month); + if (StringUtils.isNotBlank(day)) { + pubDate = pubDate.concat("-" + day); + } else { + pubDate += "-01"; + } + } else { + pubDate += "-01-01"; + } + if (isValidDate(pubDate)) { + return pubDate; + } + return null; + } + + protected static boolean isValid(final JsonObject rootElement/* , final Reporter context */) { + + final String type = getStringValue(rootElement, "type"); + if (!typologiesMapping.containsKey(type)) { + logger.error("unknowntype_" + type); +// context.incrementCounter("filtered", "unknowntype_" + type, 1); + return false; + } + + if (!isValidJsonArray(rootElement, "titles")) { + logger.error("invalid_title"); +// context.incrementCounter("filtered", "invalid_title", 1); + return false; + } + return true; + } + + private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { + if (!rootElement.has(fieldName)) { + return false; + } + final JsonElement jsonElement = rootElement.get(fieldName); + if (jsonElement.isJsonNull()) { + return false; + } + if (jsonElement.isJsonArray()) { + final JsonArray jsonArray = jsonElement.getAsJsonArray(); + if (jsonArray.isJsonNull()) { + return false; + } + if (jsonArray.get(0).isJsonNull()) { + return false; + } + } + return true; + } + + private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) { + final Qualifier qualifier = new Qualifier(); + qualifier.setClassid(classId); + qualifier.setClassname(className); + qualifier.setSchemeid(schemeId); + qualifier.setSchemename(schemeName); + return qualifier; + } + + private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, + String schemeName) { + ExternalReference ex = new ExternalReference(); + ex.setRefidentifier(extId); + ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName)); + return ex; + } + + private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) { + if (value == null | StringUtils.isBlank(value)) { + return null; + } + + final StructuredProperty structuredProperty = new StructuredProperty(); + structuredProperty.setValue(value); + structuredProperty.setQualifier(qualifier); + structuredProperty.setDataInfo(dataInfo); + return structuredProperty; + } + + private static Field mapStringField(String value, DataInfo dataInfo) { + if (value == null || StringUtils.isBlank(value)) { + return null; + } + + final Field stringField = new Field<>(); + stringField.setValue(value); + stringField.setDataInfo(dataInfo); + return stringField; + } + + private static KeyValue createCollectedFrom() { + KeyValue cf = new KeyValue(); + cf.setValue(ORCID); + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + 
"806360c771262b4d6770e7cdf04b5c5a"); + return cf; + } + + private static KeyValue createHostedBy() { + KeyValue hb = new KeyValue(); + hb.setValue("Unknown Repository"); + hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c"); + return hb; + } + + private static StructuredProperty mapAuthorId(String orcidId) { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(orcidId); + final Qualifier q = new Qualifier(); + q.setClassid("ORCID"); + q.setClassname("ORCID"); + sp.setQualifier(q); + return sp; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java index c460f6299..9b9f3c8b2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java @@ -1,107 +1,109 @@ -package eu.dnetlib.doiboost.orcidnodoi.util; -import com.google.gson.JsonArray; -import com.google.gson.JsonObject; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.doiboost.orcidnodoi.util; import java.text.SimpleDateFormat; import java.util.*; +import org.apache.commons.lang3.StringUtils; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; + public class DumpToActionsUtility { - private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); + private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); - public static String getStringValue(final JsonObject root, final String key) { - if (root.has(key) && !root.get(key).isJsonNull()) - return root.get(key).getAsString(); - return null; - } + public static String getStringValue(final JsonObject root, final String key) { + if (root.has(key) && !root.get(key).isJsonNull()) + return root.get(key).getAsString(); + return null; + } - public static List getArrayValues(final JsonObject root, final String key) { - if (root.has(key) && root.get(key).isJsonArray()) { - final JsonArray asJsonArray = root.get(key).getAsJsonArray(); - final List result = new ArrayList<>(); + public static List getArrayValues(final JsonObject root, final String key) { + if (root.has(key) && root.get(key).isJsonArray()) { + final JsonArray asJsonArray = root.get(key).getAsJsonArray(); + final List result = new ArrayList<>(); + asJsonArray.forEach(it -> { + if (StringUtils.isNotBlank(it.getAsString())) { + result.add(it.getAsString()); + } + }); + return result; + } + return new ArrayList<>(); + } - asJsonArray.forEach(it -> { - if (StringUtils.isNotBlank(it.getAsString())) { - result.add(it.getAsString()); - } - }); - return result; - } - return new ArrayList<>(); - } - public static List getArrayObjects(final JsonObject root, final String key) { - if (root.has(key) && root.get(key).isJsonArray()) { - final JsonArray asJsonArray = root.get(key).getAsJsonArray(); - final List result = new ArrayList<>(); - asJsonArray.forEach(it -> { - if (it.getAsJsonObject() != null) { - result.add(it.getAsJsonObject()); - } - }); - return result; - } - return new ArrayList<>(); - } + public static List getArrayObjects(final JsonObject root, final String key) { + if (root.has(key) && root.get(key).isJsonArray()) { + final JsonArray asJsonArray = root.get(key).getAsJsonArray(); + final List result = new ArrayList<>(); + 
asJsonArray.forEach(it -> { + if (it.getAsJsonObject() != null) { + result.add(it.getAsJsonObject()); + } + }); + return result; + } + return new ArrayList<>(); + } - public static boolean isValidDate(final String date) { - return date.matches("\\d{4}-\\d{2}-\\d{2}"); - } + public static boolean isValidDate(final String date) { + return date.matches("\\d{4}-\\d{2}-\\d{2}"); + } - public static String now_ISO8601() { // NOPMD - String result; - synchronized (ISO8601FORMAT) { - result = ISO8601FORMAT.format(new Date()); - } - //convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00 - //- note the added colon for the Timezone - return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2); - } + public static String now_ISO8601() { // NOPMD + String result; + synchronized (ISO8601FORMAT) { + result = ISO8601FORMAT.format(new Date()); + } + // convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00 + // - note the added colon for the Timezone + return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2); + } - public static String getDefaultResulttype(final String cobjcategory) { - switch (cobjcategory) { - case "0029": - return "software"; - case "0021": - case "0024": - case "0025": - case "0030": - return "dataset"; - case "0000": - case "0010": - case "0018": - case "0020": - case "0022": - case "0023": - case "0026": - case "0027": - case "0028": - case "0037": - return "other"; - case "0001": - case "0002": - case "0004": - case "0005": - case "0006": - case "0007": - case "0008": - case "0009": - case "0011": - case "0012": - case "0013": - case "0014": - case "0015": - case "0016": - case "0017": - case "0019": - case "0031": - case "0032": - return "publication"; - default: - return "publication"; - } - } + public static String getDefaultResulttype(final String cobjcategory) { + switch (cobjcategory) { + case "0029": + return "software"; + case "0021": + case "0024": + case "0025": + case "0030": + return "dataset"; + case "0000": + case "0010": + case "0018": + case "0020": + case "0022": + case "0023": + case "0026": + case "0027": + case "0028": + case "0037": + return "other"; + case "0001": + case "0002": + case "0004": + case "0005": + case "0006": + case "0007": + case "0008": + case "0009": + case "0011": + case "0012": + case "0013": + case "0014": + case "0015": + case "0016": + case "0017": + case "0019": + case "0031": + case "0032": + return "publication"; + default: + return "publication"; + } + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java index 58c09af60..8883d00f5 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java @@ -1,30 +1,32 @@ + package eu.dnetlib.doiboost.orcidnodoi.util; public class Pair { - private K k; + private K k; - private V v; + private V v; - public Pair(K k, V v) { - this.k = k; - this.v = v; - } + public Pair(K k, V v) { + this.k = k; + this.v = v; + } - public K getKey() { - return k; - } + public K getKey() { + return k; + } - public V getValue() { - return v; - } + public V getValue() { + return v; + } - @Override - public boolean equals(Object obj) { - if (obj instanceof Pair) { - Pair tmp = (Pair) obj; - return k.equals(tmp.getKey()) && v.equals(tmp.getValue()); - } else return false; - } 
+ @Override + public boolean equals(Object obj) { + if (obj instanceof Pair) { + Pair tmp = (Pair) obj; + return k.equals(tmp.getKey()) && v.equals(tmp.getValue()); + } else + return false; + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 75f857ca4..8b50f2d8f 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -54,7 +54,7 @@ public class OrcidClientTest { } // @Test - public void testLambdaFileParser() throws Exception { + private void testLambdaFileParser() throws Exception { try (BufferedReader br = new BufferedReader( new InputStreamReader(this.getClass().getResourceAsStream("last_modified.csv")))) { String line; diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java new file mode 100644 index 000000000..4d04e1a16 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java @@ -0,0 +1,76 @@ + +package eu.dnetlib.doiboost.orcidnodoi; + +import static org.junit.jupiter.api.Assertions.*; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; + +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; + +public class PublicationToOafTest { + + private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class); + + @Test +// @Ignore + public void convertOafPublicationTest() throws Exception { + String jsonPublication = IOUtils + .toString( + PublicationToOafTest.class.getResourceAsStream("publication.json")); + JsonElement j = new JsonParser().parse(jsonPublication); + logger.info("json publication loaded: " + j.toString()); + Publication oafPublication = (Publication) PublicationToOaf + .generatePublicationActionsFromDump(j.getAsJsonObject()); + assertNotNull(oafPublication.getId()); + assertNotNull(oafPublication.getOriginalId()); + assertEquals(oafPublication.getOriginalId().get(0), "60153327"); + logger.info("oafPublication.getId(): " + oafPublication.getId()); + assertEquals( + oafPublication.getTitle().get(0).getValue(), + "Evaluation of a percutaneous optical fibre glucose sensor (FiberSense) across the glycemic range with rapid glucoseexcursions using the glucose clamp"); + assertNotNull(oafPublication.getLastupdatetimestamp()); + assertNotNull(oafPublication.getDateofcollection()); + assertNotNull(oafPublication.getDateoftransformation()); + assertTrue(oafPublication.getAuthor().size() == 7); + oafPublication.getAuthor().forEach(a -> { + assertNotNull(a.getFullname()); + assertNotNull(a.getRank()); + logger.info("a.getFullname(): " + a.getFullname()); + if (a.getName() != null) { + logger.info("a.getName(): " + a.getName()); + } + if (a.getSurname() != null) { + logger.info("a.getSurname(): " + a.getSurname()); + } + logger.info("a.getRank(): " + a.getRank()); + if (a.getPid() != null) { + logger.info("a.getPid(): " + a.getPid().get(0).getValue()); + } + + }); + assertNotNull(oafPublication.getCollectedfrom()); + if (oafPublication.getSource() != null) { + 
logger.info((oafPublication.getSource().get(0).getValue())); + } + if (oafPublication.getExternalReference() != null) { + oafPublication.getExternalReference().forEach(e -> { + assertNotNull(e.getRefidentifier()); + assertEquals(e.getQualifier().getSchemeid(), "dnet:pid_types"); + }); + } + assertNotNull(oafPublication.getInstance()); + oafPublication.getInstance().forEach(i -> { + assertNotNull(i.getInstancetype().getClassid()); + logger.info("i.getInstancetype().getClassid(): " + i.getInstancetype().getClassid()); + assertNotNull(i.getInstancetype().getClassname()); + logger.info("i.getInstancetype().getClassname(): " + i.getInstancetype().getClassname()); + }); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index 6a5faddbd..d426b01f1 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -95,7 +95,8 @@ public class OrcidNoDoiTest { } @Test - public void authorMatchTest() throws Exception { + @Ignore + private void authorMatchTest() throws Exception { logger.info("running authorSimpleMatchTest ...."); String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; AuthorData author = new AuthorData(); diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json new file mode 100644 index 000000000..579e12f2e --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json @@ -0,0 +1 @@ +{"oid":"0000-0002-4147-3387","id":"60153327","sourceName":"The Chinese University of Hong Kong","type":"conference-paper","titles":["Evaluation of a percutaneous optical fibre glucose sensor (FiberSense) across the glycemic range with rapid glucoseexcursions using the glucose clamp"],"extIds":[{"type":"wosuid","value":"000425015800225","relationShip":"self"},{"type":"other-id","value":"441f521e-ab19-448d-ba32-83157b348ada","relationShip":"self"}],"publicationDates":[],"contributors":[{"sequence":"1","oid":"0000-0002-4147-3387","name":"Elaine","surname":"Chow","creditName":"Elaine Chow"},{"sequence":"2","creditName":"Victor Tsui"},{"sequence":"3","creditName":"Achim Müller"},{"sequence":"4","creditName":"Vincy Lee"},{"sequence":"5","creditName":"Lucia Krivánekova"},{"sequence":"6","creditName":"Roland Krivánek"},{"sequence":"7","creditName":"Juliana CN Chan"}]} \ No newline at end of file From ca37d3427bc4bfe05932c9231e11ccdfb98752f2 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Fri, 3 Jul 2020 23:30:31 +0200 Subject: [PATCH 07/34] separate workflow to parse orcid summaries, activities and generate dataset with no doi publications; test --- .../orcid/OrcidAuthorsDOIsDataGen.java | 8 +- .../doiboost/orcid/OrcidDSManager.java | 14 +- .../doiboost/orcid/OrcidDownloader.java | 8 +- .../orcidnodoi/ActivitiesDumpReader.java | 6 +- .../orcidnodoi/GenOrcidAuthorWork.java | 3 +- .../SparkGenEnrichedOrcidWorks.java | 18 +- .../orcidnodoi/oaf/PublicationToOaf.java | 9 +- .../doiboost/create_orcid_authors_data.json | 2 +- .../create_orcid_authors_dois_data.json | 2 +- .../dhp/doiboost/download_orcid_data.json | 2 +- .../oozie_app/workflow.xml | 497 +---------------- 
.../dhp/doiboost/orcid/oozie_app/workflow.xml | 44 +- .../oozie_app/config-default.xml | 31 ++ .../orcid_activities/oozie_app/workflow.xml | 514 ++++++++++++++++++ .../oozie_app/config-default.xml | 22 + .../orcid_summaries/oozie_app/workflow.xml | 68 +++ .../doiboost/orcid/OrcidClientTest.java | 29 +- .../orcidnodoi/PublicationToOafTest.java | 5 +- .../orcidnodoi/xml/OrcidNoDoiTest.java | 4 +- .../xml/activity_work_0000-0002-2536-4498.xml | 72 +++ 20 files changed, 815 insertions(+), 543 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java index 70528a8f6..2ec4fe59d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java @@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager { public void generateAuthorsDOIsData() throws Exception { Configuration conf = initConfigurationObject(); FileSystem fs = initFileSystemObject(conf); - String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz); - Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath)); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz); + Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath)); ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath); } @@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager { hdfsServerUri = parser.get("hdfsServerUri"); Log.info("HDFS URI: " + hdfsServerUri); - hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); - Log.info("Default Path: " + hdfsOrcidDefaultPath); + workingPath = parser.get("workingPath"); + Log.info("Default Path: " + workingPath); activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); Log.info("Activities File Name: " + activitiesFileNameTarGz); outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index 4f846bdf3..aa61c0117 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; public class OrcidDSManager { protected String hdfsServerUri; - protected String hdfsOrcidDefaultPath; + protected String workingPath; private String summariesFileNameTarGz; private String outputAuthorsPath; @@ 
-28,10 +28,10 @@ public class OrcidDSManager { public void generateAuthors() throws Exception { Configuration conf = initConfigurationObject(); FileSystem fs = initFileSystemObject(conf); - String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz); Path outputPath = new Path( hdfsServerUri - .concat(hdfsOrcidDefaultPath) + .concat(workingPath) .concat(outputAuthorsPath) .concat("authors.seq")); SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); @@ -41,7 +41,7 @@ public class OrcidDSManager { // ====== Init HDFS File System Object Configuration conf = new Configuration(); // Set FileSystem URI - conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath)); + conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath)); // Because of Maven conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); @@ -52,7 +52,7 @@ public class OrcidDSManager { // Get the filesystem - HDFS FileSystem fs = null; try { - fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf); + fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -71,8 +71,8 @@ public class OrcidDSManager { hdfsServerUri = parser.get("hdfsServerUri"); Log.info("HDFS URI: " + hdfsServerUri); - hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); - Log.info("Default Path: " + hdfsOrcidDefaultPath); + workingPath = parser.get("workingPath"); + Log.info("Working Path: " + workingPath); summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); Log.info("Summaries File Name: " + summariesFileNameTarGz); outputAuthorsPath = parser.get("outputAuthorsPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java index 2e1a199da..762d8aecd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java @@ -69,12 +69,12 @@ public class OrcidDownloader extends OrcidDSManager { long startDownload = 0; Configuration conf = initConfigurationObject(); FileSystem fs = initFileSystemObject(conf); - String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName); + String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName); Path hdfsreadpath = new Path(lambdaFileUri); FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath); Path hdfsoutputPath = new Path( hdfsServerUri - .concat(hdfsOrcidDefaultPath) + .concat(workingPath) .concat(outputPath) .concat("orcid_records.seq")); @@ -176,8 +176,8 @@ public class OrcidDownloader extends OrcidDSManager { hdfsServerUri = parser.get("hdfsServerUri"); Log.info("HDFS URI: " + hdfsServerUri); - hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); - Log.info("Default Path: " + hdfsOrcidDefaultPath); + workingPath = parser.get("workingPath"); + Log.info("Default Path: " + workingPath); lambdaFileName = parser.get("lambdaFileName"); Log.info("Lambda File Name: " + lambdaFileName); outputPath = parser.get("outputPath"); diff --git 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index bf63568d8..807f52972 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; public class ActivitiesDumpReader { - private static final int MAX_XML_WORKS_PARSED = 100; - private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10; + private static final int MAX_XML_WORKS_PARSED = -1; + private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000; public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) throws Exception { @@ -127,7 +127,7 @@ public class ActivitiesDumpReader { Log .warn( "Parsing work from tar archive and xml work: " + filename + " " + e.getMessage()); - Log.warn(e); +// Log.warn(e); } if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index 8dcee796c..041424ba9 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -16,7 +16,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager { private String activitiesFileNameTarGz; private String outputWorksPath; - private String workingPath; +// private String workingPath; public static void main(String[] args) throws IOException, Exception { GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork(); @@ -45,7 +45,6 @@ public class GenOrcidAuthorWork extends OrcidDSManager { Log.info("HDFS URI: " + hdfsServerUri); workingPath = parser.get("workingPath"); Log.info("Working Path: " + workingPath); - hdfsOrcidDefaultPath = workingPath; activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); Log.info("Activities File Name: " + activitiesFileNameTarGz); outputWorksPath = parser.get("outputWorksPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index ae1e4dae6..b0b989463 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -16,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,9 +25,11 @@ import com.google.gson.JsonElement; import com.google.gson.JsonParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import 
eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; @@ -59,7 +62,7 @@ public class SparkGenEnrichedOrcidWorks { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaPairRDD summariesRDD = sc - .sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class); + .sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class); Dataset summariesDataset = spark .createDataset( summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), @@ -89,8 +92,19 @@ public class SparkGenEnrichedOrcidWorks { .filter(Objects::nonNull) .toJavaRDD(); logger.info("Works enriched data created: " + enrichedWorksRDD.count()); - enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath); + enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); logger.info("Works enriched data saved"); + JavaRDD> oafPublicationRDD = enrichedWorksRDD.map(e -> { + JsonElement j = new JsonParser().parse(e._2()); + return new Tuple2<>(e._1(), (Publication) PublicationToOaf + .generatePublicationActionsFromDump(j.getAsJsonObject())); + }); + + Dataset> publicationDataset = spark + .createDataset( + oafPublicationRDD.repartition(1).rdd(), + Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class))); + publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output"); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index dc03767ec..19bfe0f30 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -172,7 +172,7 @@ public class PublicationToOaf { instance.setUrl(urls); } - final String pubDate = getPublicationDate(rootElement, "publication_date"); + final String pubDate = getPublicationDate(rootElement, "publicationDates"); if (StringUtils.isNotBlank(pubDate)) { instance.setDateofacceptance(mapStringField(pubDate, null)); } @@ -325,7 +325,12 @@ public class PublicationToOaf { private static String getPublicationDate(final JsonObject rootElement, final String jsonKey) { - final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); + JsonObject pubDateJson = null; + try { + pubDateJson = rootElement.getAsJsonObject(jsonKey); + } catch (Exception e) { + return null; + } if (pubDateJson == null) { return null; } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json index bf992b508..6f213e415 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json @@ -1,6 +1,6 @@ [ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, - {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, 
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json index 131c30125..b2f0fdeda 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json @@ -1,6 +1,6 @@ [ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, - {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json index 444e487f7..8c69b168b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json @@ -1,6 +1,6 @@ [ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, - {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, {"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}, {"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true} diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml index df5e0e76f..a60af8b45 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml @@ -1,75 +1,9 @@ - workingPath_activities + workingPath the working dir base path - - shell_cmd_0 - wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz 
/data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 0 - - - shell_cmd_1 - wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 1 - - - shell_cmd_2 - wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 2 - - - shell_cmd_3 - wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 3 - - - shell_cmd_4 - wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 4 - - - shell_cmd_5 - wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 5 - - - shell_cmd_6 - wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 6 - - - shell_cmd_7 - wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 7 - - - shell_cmd_8 - wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 8 - - - shell_cmd_9 - wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 9 - - - shell_cmd_X - wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; 
rm -f /tmp/ORCID_2019_activites_X.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file X - @@ -80,436 +14,11 @@ - - + - + - - - - - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_0.tar.gz - -owno_doi_works/works_0.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_1} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_1.tar.gz - -owno_doi_works/works_1.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_2} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_2.tar.gz - -owno_doi_works/works_2.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_3} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_3.tar.gz - -owno_doi_works/works_3.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_4} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_4.tar.gz - -owno_doi_works/works_4.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_5} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_5.tar.gz - -owno_doi_works/works_5.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_6} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_6.tar.gz - -owno_doi_works/works_6.seq - -oewno_doi_enriched_works/ - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_7} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_7.tar.gz - -owno_doi_works/works_7.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))} - - - - - - - - ${jobTracker} 
- ${nameNode} - bash - -c - ${shell_cmd_8} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_8.tar.gz - -owno_doi_works/works_8.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_9} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_9.tar.gz - -owno_doi_works/works_9.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_X} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_X.tar.gz - -owno_doi_works/works_X.seq - -oewno_doi_enriched_works/ - - - - - - diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml index 7a8d04187..51e00dc0f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml @@ -1,9 +1,15 @@ - + workingPath the working dir base path + + shell_cmd_0 + wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz + + the shell command that downloads and puts to hdfs orcid summaries + @@ -15,24 +21,44 @@ - - + + - + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + - - - + ${jobTracker} ${nameNode} eu.dnetlib.doiboost.orcid.OrcidDSManager - -d${workingPath}/ + -w${workingPath}/ -n${nameNode} -fORCID_2019_summaries.tar.gz - -ooutput/ + -osummaries/output/ diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml new file mode 100644 index 000000000..3068562d0 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml @@ -0,0 +1,31 @@ + + + oozie.action.sharelib.for.java + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx4g + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml new file mode 100644 index 000000000..8f9a5123e --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml @@ -0,0 
+1,514 @@ + + + + workingPath + the working dir base path + + + shell_cmd_0 + wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 0 + + + shell_cmd_1 + wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 1 + + + shell_cmd_2 + wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 2 + + + shell_cmd_3 + wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 3 + + + shell_cmd_4 + wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 4 + + + shell_cmd_5 + wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 5 + + + shell_cmd_6 + wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 6 + + + shell_cmd_7 + wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 7 + + + shell_cmd_8 + wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 8 + + + shell_cmd_9 + wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz + + the shell command that downloads and puts to hdfs orcid activity 
file 9 + + + shell_cmd_X + wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file X + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_0.tar.gz + -owno_doi_works/works_0.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_1} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_1.tar.gz + -owno_doi_works/works_1.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_2} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_2.tar.gz + -owno_doi_works/works_2.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_3} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_3.tar.gz + -owno_doi_works/works_3.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_4} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_4.tar.gz + -owno_doi_works/works_4.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_5} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_5.tar.gz + -owno_doi_works/works_5.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_6} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_6.tar.gz + -owno_doi_works/works_6.seq + -oewno_doi_enriched_works/ + + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_7} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_7.tar.gz + -owno_doi_works/works_7.seq + 
-oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_8} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_8.tar.gz + -owno_doi_works/works_8.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_9} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_9.tar.gz + -owno_doi_works/works_9.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_X} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_X.tar.gz + -owno_doi_works/works_X.seq + -oewno_doi_enriched_works/ + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml new file mode 100644 index 000000000..e77dd09c9 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml new file mode 100644 index 000000000..3362cc67b --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml @@ -0,0 +1,68 @@ + + + + workingPath + the working dir base path + + + shell_cmd_0 + wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz + + the shell command that downloads and puts to hdfs orcid summaries + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidDSManager + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_summaries.tar.gz + -osummaries/output/ + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 8b50f2d8f..5e0f91ecd 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ 
b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -3,9 +3,8 @@ package eu.dnetlib.doiboost.orcid; import static org.junit.jupiter.api.Assertions.assertTrue; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; +import java.io.*; +import java.nio.file.Files; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Arrays; @@ -20,6 +19,7 @@ import org.apache.http.impl.client.HttpClients; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import jdk.nashorn.internal.ir.annotations.Ignore; public class OrcidClientTest { final String orcidId = "0000-0001-7291-3210"; @@ -32,11 +32,20 @@ public class OrcidClientTest { String lastUpdate = "2019-09-30 00:00:00"; String shortDate = "2020-05-06 16:06:11"; -// curl -i -H "Accept: application/vnd.orcid+xml" +// curl -i -H "Accept: application/vnd.orcid+xml" // -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d' // 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record' - public String testDownloadRecord(String orcidId) throws Exception { + @Test + public void downloadTest() throws Exception { + String record = testDownloadRecord("0000-0002-2536-4498"); + File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml"); + OutputStream outStream = new FileOutputStream(f); + IOUtils.write(record.getBytes(), outStream); + System.out.println("saved to tmp"); + } + + private String testDownloadRecord(String orcidId) throws Exception { try (CloseableHttpClient client = HttpClients.createDefault()) { HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); httpGet.addHeader("Accept", "application/vnd.orcid+xml"); @@ -100,7 +109,7 @@ public class OrcidClientTest { } // @Test - public void getRecordDatestamp() throws ParseException { + private void getRecordDatestamp() throws ParseException { Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate); Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate); Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate); @@ -108,7 +117,7 @@ public class OrcidClientTest { assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt)); } - public void testDate(String value) throws ParseException { + private void testDate(String value) throws ParseException { System.out.println(value.toString()); if (value.length() != 19) { value = value.substring(0, 19); @@ -118,14 +127,16 @@ public class OrcidClientTest { } // @Test - public void testModifiedDate() throws ParseException { + @Ignore + private void testModifiedDate() throws ParseException { testDate(toRetrieveDate); testDate(toNotRetrieveDate); testDate(shortDate); } // @Test - public void testReadBase64CompressedRecord() throws Exception { + @Ignore + private void testReadBase64CompressedRecord() throws Exception { final String base64CompressedRecord = IOUtils .toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64")); final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java index 4d04e1a16..39f78522f 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java +++ 
b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java @@ -13,14 +13,15 @@ import com.google.gson.JsonParser; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; +import jdk.nashorn.internal.ir.annotations.Ignore; public class PublicationToOafTest { private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class); @Test -// @Ignore - public void convertOafPublicationTest() throws Exception { + @Ignore + private void convertOafPublicationTest() throws Exception { String jsonPublication = IOUtils .toString( PublicationToOafTest.class.getResourceAsStream("publication.json")); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index d426b01f1..ca91a242a 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -42,12 +42,12 @@ public class OrcidNoDoiTest { @Test @Ignore - private void readPublicationFieldsTest() + public void readPublicationFieldsTest() throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { logger.info("running loadPublicationFieldsTest ...."); String xml = IOUtils .toString( - OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml")); + OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml")); if (xml == null) { logger.info("Resource not found"); diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml new file mode 100644 index 000000000..43d3b2351 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml @@ -0,0 +1,72 @@ + + + 2019-10-22T03:18:13.755Z + 2020-06-17T11:07:13.703Z + + + https://orcid.org/client/0000-0001-8607-8906 + 0000-0001-8607-8906 + orcid.org + + INSPIRE-HEP + + + Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector + + + + other-id + 1759875 + 1759875 + http://inspirehep.net/record/1759875 + self + + + doi + 10.1140/epjc/s10052-020-7907-9 + 10.1140/epjc/s10052-020-7907-9 + http://dx.doi.org/10.1140/epjc/s10052-020-7907-9 + self + + + arxiv + 1910.08819 + arXiv:1910.08819 + http://arxiv.org/abs/1910.08819 + self + + + http://inspirehep.net/record/1759875 + journal-article + + 2020 + 06 + 12 + + Eur.Phys.J.C + From c82b15b5f4817348d446fbf53f8337a5e3601085 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 28 Jul 2020 15:23:52 +0200 Subject: [PATCH 08/34] migrate configuration to ocean, fix publication dataset creation --- .../SparkGenEnrichedOrcidWorks.java | 13 ++-- .../oozie_app/config-default.xml | 31 --------- .../oozie_app/workflow.xml | 68 ++++++++++++++++--- .../orcid/xml/XMLRecordParserTest.java | 6 +- 4 files changed, 68 insertions(+), 50 deletions(-) delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml diff --git 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index b0b989463..b24e71615 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -91,19 +91,18 @@ public class SparkGenEnrichedOrcidWorks { Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .filter(Objects::nonNull) .toJavaRDD(); - logger.info("Works enriched data created: " + enrichedWorksRDD.count()); enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); logger.info("Works enriched data saved"); - JavaRDD> oafPublicationRDD = enrichedWorksRDD.map(e -> { + JavaRDD oafPublicationRDD = enrichedWorksRDD.map(e -> { JsonElement j = new JsonParser().parse(e._2()); - return new Tuple2<>(e._1(), (Publication) PublicationToOaf - .generatePublicationActionsFromDump(j.getAsJsonObject())); - }); + return (Publication) PublicationToOaf + .generatePublicationActionsFromDump(j.getAsJsonObject()); + }).filter(p -> p != null); - Dataset> publicationDataset = spark + Dataset publicationDataset = spark .createDataset( oafPublicationRDD.repartition(1).rdd(), - Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class))); + Encoders.bean(Publication.class)); publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output"); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml deleted file mode 100644 index 3068562d0..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml +++ /dev/null @@ -1,31 +0,0 @@ - - - oozie.action.sharelib.for.java - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx4g - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml index a60af8b45..faed3104a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml @@ -1,11 +1,56 @@ - + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + 
spark2EventLogDir + spark 2.* event log dir location + workingPath the working dir base path + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -16,20 +61,25 @@ - + - + - ${jobTracker} - ${nameNode} - yarn + yarn-cluster cluster - Gen_Enriched_Orcid_Works + GenOrcidNoDoiDataset eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks - dhp-doiboost-1.2.4-SNAPSHOT.jar - --num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} -w${workingPath}/ -n${nameNode} diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java index 4d8237f77..5bf6f27b9 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -12,7 +12,7 @@ import eu.dnetlib.doiboost.orcid.model.WorkData; public class XMLRecordParserTest { @Test - public void testOrcidAuthorDataXMLParser() throws Exception { + private void testOrcidAuthorDataXMLParser() throws Exception { String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml")); @@ -27,7 +27,7 @@ public class XMLRecordParserTest { } @Test - public void testOrcidXMLErrorRecordParser() throws Exception { + private void testOrcidXMLErrorRecordParser() throws Exception { String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml")); @@ -40,7 +40,7 @@ public class XMLRecordParserTest { } @Test - public void testOrcidWorkDataXMLParser() throws Exception { + private void testOrcidWorkDataXMLParser() throws Exception { String xml = IOUtils .toString( From 196f36c6edd10203ff304e7cd122b3b679593618 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 30 Jul 2020 13:38:33 +0200 Subject: [PATCH 09/34] fix publication dataset creation --- .../SparkGenEnrichedOrcidWorks.java | 47 +++++-- .../orcidnodoi/oaf/PublicationToOaf.java | 117 +++++++++++++----- .../orcidnodoi/PublicationToOafTest.java | 3 +- 3 files changed, 128 insertions(+), 39 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index b24e71615..cae5a168f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -17,10 +17,12 @@ import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; +import 
org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.gson.Gson; +import com.google.gson.GsonBuilder; import com.google.gson.JsonElement; import com.google.gson.JsonParser; @@ -93,17 +95,48 @@ public class SparkGenEnrichedOrcidWorks { .toJavaRDD(); enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); logger.info("Works enriched data saved"); - JavaRDD oafPublicationRDD = enrichedWorksRDD.map(e -> { - JsonElement j = new JsonParser().parse(e._2()); - return (Publication) PublicationToOaf - .generatePublicationActionsFromDump(j.getAsJsonObject()); - }).filter(p -> p != null); + + final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); + final LongAccumulator enrichedPublications = spark + .sparkContext() + .longAccumulator("enrichedPublications"); + final LongAccumulator errorsGeneric = spark.sparkContext().longAccumulator("errorsGeneric"); + final LongAccumulator errorsInvalidTitle = spark.sparkContext().longAccumulator("errorsInvalidTitle"); + final LongAccumulator errorsNotFoundAuthors = spark + .sparkContext() + .longAccumulator("errorsNotFoundAuthors"); + final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType"); + final PublicationToOaf publicationToOaf = new PublicationToOaf( + parsedPublications, + enrichedPublications, + errorsGeneric, + errorsInvalidTitle, + errorsNotFoundAuthors, + errorsInvalidType); + JavaRDD oafPublicationRDD = enrichedWorksRDD + .map( + e -> { + return (Publication) publicationToOaf + .generatePublicationActionsFromJson(e._2()); + }) + .filter(p -> p != null); Dataset publicationDataset = spark .createDataset( - oafPublicationRDD.repartition(1).rdd(), + oafPublicationRDD.rdd(), Encoders.bean(Publication.class)); - publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output"); + publicationDataset + .write() + .format("parquet") + .mode(SaveMode.Overwrite) + .save(workingPath + "no_doi_dataset/output"); + + logger.info("parsedPublications: " + parsedPublications.value().toString()); + logger.info("enrichedPublications: " + enrichedPublications.value().toString()); + logger.info("errorsGeneric: " + errorsGeneric.value().toString()); + logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString()); + logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString()); + logger.info("errorsInvalidType: " + errorsInvalidType.value().toString()); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 19bfe0f30..448fa9a74 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -3,18 +3,17 @@ package eu.dnetlib.doiboost.orcidnodoi.oaf; import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*; +import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; +import 
com.google.gson.*; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.schema.oaf.*; @@ -22,7 +21,7 @@ import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; import eu.dnetlib.doiboost.orcidnodoi.util.Pair; -public class PublicationToOaf { +public class PublicationToOaf implements Serializable { static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); @@ -31,6 +30,37 @@ public class PublicationToOaf { public static final String OPENAIRE_PREFIX = "openaire____"; public static final String SEPARATOR = "::"; + private final LongAccumulator parsedPublications; + private final LongAccumulator enrichedPublications; + private final LongAccumulator errorsGeneric; + private final LongAccumulator errorsInvalidTitle; + private final LongAccumulator errorsNotFoundAuthors; + private final LongAccumulator errorsInvalidType; + + public PublicationToOaf( + LongAccumulator parsedPublications, + LongAccumulator enrichedPublications, + LongAccumulator errorsGeneric, + LongAccumulator errorsInvalidTitle, + LongAccumulator errorsNotFoundAuthors, + LongAccumulator errorsInvalidType) { + this.parsedPublications = parsedPublications; + this.enrichedPublications = enrichedPublications; + this.errorsGeneric = errorsGeneric; + this.errorsInvalidTitle = errorsInvalidTitle; + this.errorsNotFoundAuthors = errorsNotFoundAuthors; + this.errorsInvalidType = errorsInvalidType; + } + + public PublicationToOaf() { + this.parsedPublications = null; + this.enrichedPublications = null; + this.errorsGeneric = null; + this.errorsInvalidTitle = null; + this.errorsNotFoundAuthors = null; + this.errorsInvalidType = null; + } + private static Map> datasources = new HashMap>() { { @@ -69,11 +99,27 @@ public class PublicationToOaf { public static final String PID_TYPES = "dnet:pid_types"; - public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement) { + public Oaf generatePublicationActionsFromJson(final String json) { + try { + if (parsedPublications != null) { + parsedPublications.add(1); + } + JsonElement jElement = new JsonParser().parse(json); + JsonObject jObject = jElement.getAsJsonObject(); + return generatePublicationActionsFromDump(jObject); + } catch (Throwable t) { + logger.error("creating publication: " + t.getMessage()); + if (errorsGeneric != null) { + errorsGeneric.add(1); + } + return null; + } + } + + public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) { logger.debug("generatePublicationActionsFromDump ..."); - if (!isValid(rootElement/* , context */)) { - logger.error("publication not valid"); + if (!isValid(rootElement)) { return null; } @@ -122,8 +168,9 @@ public class PublicationToOaf { // Adding titles final List titles = createRepeatedField(rootElement, "titles"); if (titles == null || titles.isEmpty()) { - logger.error("titles not found"); -// context.incrementCounter("filtered", "title_not_found", 1); + if (errorsInvalidTitle != null) { + errorsInvalidTitle.add(1); + } return null; } Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); @@ -189,8 +236,9 @@ public class PublicationToOaf { publication.setInstance(Arrays.asList(instance)); } else { - logger.error("type not found"); -// context.incrementCounter("filtered", "type_not_found", 1); + if (errorsInvalidType != null) { + errorsInvalidType.add(1); + } return null; } @@ -199,17 +247,21 @@ public class PublicationToOaf { if (authors != null && authors.size() > 0) { 
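// at least one contributor must have been mapped to an OAF author; otherwise the errorsNotFoundAuthors accumulator is incremented below and the record is discarded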
publication.setAuthor(authors); } else { - logger.error("authors not found"); -// context.incrementCounter("filtered", "author_not_found", 1); + if (errorsNotFoundAuthors != null) { + errorsNotFoundAuthors.add(1); + } return null; } String classValue = getDefaultResulttype(cobjValue); publication .setResulttype(mapQualifier(classValue, classValue, "dnet:result_typologies", "dnet:result_typologies")); + if (enrichedPublications != null) { + enrichedPublications.add(1); + } return publication; } - public static List createAuthors(final JsonObject root) { + public List createAuthors(final JsonObject root) { final String authorsJSONFieldName = "contributors"; @@ -273,7 +325,7 @@ public class PublicationToOaf { return null; } - private static List createRepeatedField(final JsonObject rootElement, final String fieldName) { + private List createRepeatedField(final JsonObject rootElement, final String fieldName) { if (!rootElement.has(fieldName)) { return null; } @@ -291,14 +343,14 @@ public class PublicationToOaf { } } - private static String cleanField(String value) { + private String cleanField(String value) { if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') { value = value.substring(1, value.length() - 1); } return value; } - private static void settingRelevantDate(final JsonObject rootElement, + private void settingRelevantDate(final JsonObject rootElement, final Publication publication, final String jsonKey, final String dictionaryKey, @@ -322,7 +374,7 @@ public class PublicationToOaf { } } - private static String getPublicationDate(final JsonObject rootElement, + private String getPublicationDate(final JsonObject rootElement, final String jsonKey) { JsonObject pubDateJson = null; @@ -358,24 +410,27 @@ public class PublicationToOaf { return null; } - protected static boolean isValid(final JsonObject rootElement/* , final Reporter context */) { + protected boolean isValid(final JsonObject rootElement/* , final Reporter context */) { final String type = getStringValue(rootElement, "type"); if (!typologiesMapping.containsKey(type)) { logger.error("unknowntype_" + type); -// context.incrementCounter("filtered", "unknowntype_" + type, 1); + if (errorsInvalidType != null) { + errorsInvalidType.add(1); + } return false; } if (!isValidJsonArray(rootElement, "titles")) { - logger.error("invalid_title"); -// context.incrementCounter("filtered", "invalid_title", 1); + if (errorsInvalidTitle != null) { + errorsInvalidTitle.add(1); + } return false; } return true; } - private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { + private boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { if (!rootElement.has(fieldName)) { return false; } @@ -395,7 +450,7 @@ public class PublicationToOaf { return true; } - private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) { + private Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) { final Qualifier qualifier = new Qualifier(); qualifier.setClassid(classId); qualifier.setClassname(className); @@ -404,7 +459,7 @@ public class PublicationToOaf { return qualifier; } - private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, + private ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, String schemeName) { ExternalReference ex = new ExternalReference(); 
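// the raw external identifier value is copied onto the OAF ExternalReference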
ex.setRefidentifier(extId); @@ -412,7 +467,7 @@ public class PublicationToOaf { return ex; } - private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) { + private StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) { if (value == null | StringUtils.isBlank(value)) { return null; } @@ -424,7 +479,7 @@ public class PublicationToOaf { return structuredProperty; } - private static Field mapStringField(String value, DataInfo dataInfo) { + private Field mapStringField(String value, DataInfo dataInfo) { if (value == null || StringUtils.isBlank(value)) { return null; } @@ -435,21 +490,21 @@ public class PublicationToOaf { return stringField; } - private static KeyValue createCollectedFrom() { + private KeyValue createCollectedFrom() { KeyValue cf = new KeyValue(); cf.setValue(ORCID); cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); return cf; } - private static KeyValue createHostedBy() { + private KeyValue createHostedBy() { KeyValue hb = new KeyValue(); hb.setValue("Unknown Repository"); hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c"); return hb; } - private static StructuredProperty mapAuthorId(String orcidId) { + private StructuredProperty mapAuthorId(String orcidId) { final StructuredProperty sp = new StructuredProperty(); sp.setValue(orcidId); final Qualifier q = new Qualifier(); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java index 39f78522f..01e26dcb4 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java @@ -27,7 +27,8 @@ public class PublicationToOafTest { PublicationToOafTest.class.getResourceAsStream("publication.json")); JsonElement j = new JsonParser().parse(jsonPublication); logger.info("json publication loaded: " + j.toString()); - Publication oafPublication = (Publication) PublicationToOaf + PublicationToOaf publicationToOaf = new PublicationToOaf(); + Publication oafPublication = (Publication) publicationToOaf .generatePublicationActionsFromDump(j.getAsJsonObject()); assertNotNull(oafPublication.getId()); assertNotNull(oafPublication.getOriginalId()); From 0377b40fbad56c0dd75fb7c8287488a4f63ceffe Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 30 Jul 2020 18:38:07 +0200 Subject: [PATCH 10/34] output to one parquet file --- .../doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 2 +- .../doiboost/orcidnodoi/oaf/PublicationToOaf.java | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index cae5a168f..dea597cbb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -123,7 +123,7 @@ public class SparkGenEnrichedOrcidWorks { Dataset publicationDataset = spark .createDataset( - oafPublicationRDD.rdd(), + oafPublicationRDD.repartition(1).rdd(), Encoders.bean(Publication.class)); 
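// repartition(1) collapses the dataset to a single partition, so the parquet output written below ends up in one file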
publicationDataset .write() diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 448fa9a74..503df67ff 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory; import com.google.gson.*; import eu.dnetlib.dhp.common.PacePerson; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; @@ -217,6 +218,8 @@ public class PublicationToOaf implements Serializable { final List urls = createRepeatedField(rootElement, "urls"); if (urls != null && !urls.isEmpty()) { instance.setUrl(urls); + } else { + dataInfo.setInvisible(true); } final String pubDate = getPublicationDate(rootElement, "publicationDates"); @@ -508,8 +511,10 @@ public class PublicationToOaf implements Serializable { final StructuredProperty sp = new StructuredProperty(); sp.setValue(orcidId); final Qualifier q = new Qualifier(); - q.setClassid("ORCID"); - q.setClassname("ORCID"); + q.setClassid(ORCID.toLowerCase()); + q.setClassname(ORCID.toLowerCase()); + q.setSchemeid(ModelConstants.DNET_PID_TYPES); + q.setSchemename(ModelConstants.DNET_PID_TYPES); sp.setQualifier(q); return sp; } From 538f299767d433ba17681ab82f4b7a32bfb24a2c Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 14 Sep 2020 12:35:16 +0200 Subject: [PATCH 11/34] merged --- .../dhp/broker/oa/IndexNotificationsJob.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index cb7acb46d..792a2354a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -47,8 +47,9 @@ public class IndexNotificationsJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(IndexNotificationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); + .toString( + IndexNotificationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); @@ -116,7 +117,8 @@ public class IndexNotificationsJob { final long date) { final List list = subscriptions .stream() - .filter(s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) + .filter( + s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) .filter(s -> verifyConditions(e.getMap(), s.conditionsAsMap())) .map(s -> generateNotification(s, e, date)) .collect(Collectors.toList()); @@ -147,15 +149,18 @@ public class IndexNotificationsJob { if (conditions.containsKey("trust") && !SubscriptionUtils - .verifyFloatRange(map.getTrust(), conditions.get("trust").get(0).getValue(), conditions.get("trust").get(0).getOtherValue())) { + .verifyFloatRange( + map.getTrust(), 
conditions.get("trust").get(0).getValue(), + conditions.get("trust").get(0).getOtherValue())) { return false; } if (conditions.containsKey("targetDateofacceptance") && !conditions .get("targetDateofacceptance") .stream() - .anyMatch(c -> SubscriptionUtils - .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { + .anyMatch( + c -> SubscriptionUtils + .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { return false; } From 9e8e7fe6ef24dbf6a004190cf86cbc623c8b8d21 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 15 Sep 2020 11:32:49 +0200 Subject: [PATCH 12/34] add comments --- .../java/eu/dnetlib/doiboost/orcid/model/AuthorData.java | 4 ++++ .../dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java | 6 +++++- .../dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java | 6 +++++- .../doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 5 ++++- .../eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java | 4 ++++ .../eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java | 4 ++++ .../eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java | 4 ++++ .../doiboost/orcidnodoi/model/PublicationDate.java | 4 ++++ .../dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java | 4 ++++ .../dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 5 ++++- .../doiboost/orcidnodoi/similarity/AuthorMatcher.java | 8 ++++++++ .../doiboost/orcidnodoi/util/DumpToActionsUtility.java | 4 ++++ .../doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java | 4 ++++ 13 files changed, 58 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java index 87f1f65c8..e0624509b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java @@ -3,6 +3,10 @@ package eu.dnetlib.doiboost.orcid.model; import java.io.Serializable; +/** + * This class models the data that are retrieved from orcid publication + */ + public class AuthorData implements Serializable { private String oid; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index 807f52972..d852a7023 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -20,10 +20,14 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; import eu.dnetlib.doiboost.orcid.json.JsonHelper; -import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; +/** + * This class write on hdfs one sequence file, the key is an orcid identifier and the + * value is an orcid publication in json format + */ + public class ActivitiesDumpReader { private static final int MAX_XML_WORKS_PARSED = -1; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index 041424ba9..d32e6d945 100644 --- 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -12,11 +12,15 @@ import org.mortbay.log.Log; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.doiboost.orcid.OrcidDSManager; +/** + * This job generates one sequence file, the key is an orcid identifier and the + * value is an orcid publication in json format + */ + public class GenOrcidAuthorWork extends OrcidDSManager { private String activitiesFileNameTarGz; private String outputWorksPath; -// private String workingPath; public static void main(String[] args) throws IOException, Exception { GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork(); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index dea597cbb..b984ee2b2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -22,7 +22,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.gson.Gson; -import com.google.gson.GsonBuilder; import com.google.gson.JsonElement; import com.google.gson.JsonParser; @@ -35,6 +34,10 @@ import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; +/** + * This spark job generates one parquet file, containing orcid publications dataset + */ + public class SparkGenEnrichedOrcidWorks { static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java index 7f7e3a10a..363cb13e6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java @@ -6,6 +6,10 @@ import com.google.gson.JsonObject; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; +/** + * This class converts an object to json and viceversa + */ + public class JsonWriter { public static String create(AuthorData authorData) { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java index 8a170de09..9a8651c85 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java @@ -5,6 +5,10 @@ import java.io.Serializable; import eu.dnetlib.doiboost.orcid.model.AuthorData; +/** + * This class models the data related to a contributor, that are retrieved from an orcid publication + */ + public class Contributor extends AuthorData implements Serializable { private String sequence; private String role; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java index 865e54ae3..7fe50ce25 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java @@ -1,6 +1,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model; +/** + * This class models the data related to external id, that are retrieved from an orcid publication + */ + public class ExternalId { private String type; private String value; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java index 9282a80ba..5f794d8eb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java @@ -1,6 +1,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model; +/** + * This class models the data related to a publication date, that are retrieved from an orcid publication + */ + public class PublicationDate { private String year; private String month; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java index 5756521e7..58f992d12 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java @@ -4,6 +4,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model; import java.io.Serializable; import java.util.List; +/** + * This class models the data that are retrieved from orcid publication + */ + public class WorkDataNoDoi implements Serializable { private String oid; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 503df67ff..4d1408470 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -22,6 +22,10 @@ import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; import eu.dnetlib.doiboost.orcidnodoi.util.Pair; +/** + * This class converts an orcid publication from json format to oaf + */ + public class PublicationToOaf implements Serializable { static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); @@ -119,7 +123,6 @@ public class PublicationToOaf implements Serializable { public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) { - logger.debug("generatePublicationActionsFromDump ..."); if (!isValid(rootElement)) { return null; } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 1e4c38bef..88c84ee89 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -21,6 +21,14 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +/** + * This class is used for searching from a list of publication contributors a + * specific author making a similarity check on both name and surname of the + * author with the credit name of each contributor of the list; as soon as + * the match is found (if exist) author informations are used to enrich the + * matched contribuotr inside contributors list + */ + public class AuthorMatcher { private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java index 9b9f3c8b2..ea4e58c44 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java @@ -9,6 +9,10 @@ import org.apache.commons.lang3.StringUtils; import com.google.gson.JsonArray; import com.google.gson.JsonObject; +/** + * Utility class + */ + public class DumpToActionsUtility { private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java index ae96a322f..c5c115551 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -17,6 +17,10 @@ import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId; import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +/** + * This class is used for parsing xml data with vtd parser + */ + public class XMLRecordParserNoDoi { private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class); From fefbcfb10682728a499ad6181e83519f66b708a7 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 22 Sep 2020 10:20:25 +0200 Subject: [PATCH 13/34] dependency version moved to main pom (PR review) --- dhp-workflows/dhp-doiboost/pom.xml | 2 +- pom.xml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index e9768be7e..357a57367 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -87,7 +87,7 @@ org.apache.commons commons-text - 1.8 + ${common.text.version} diff --git a/pom.xml b/pom.xml index e88e1d51b..9897c8abe 100644 --- a/pom.xml +++ b/pom.xml @@ -669,5 +669,6 @@ 1.1 3.5.3 4.13.0 + 1.8 From a97ad20c7bd7725ee513694d9b00aae0a19b19d2 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 22 Sep 2020 10:46:34 +0200 Subject: [PATCH 14/34] exception is now propagated (PR review) --- .../java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index aa61c0117..8ebeab2e5 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -48,15 +48,10 @@ public class OrcidDSManager { return conf; } - protected FileSystem initFileSystemObject(Configuration conf) { + protected FileSystem initFileSystemObject(Configuration conf) throws IOException { // Get the filesystem - HDFS FileSystem fs = null; - try { - fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } + fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf); return fs; } From ab083f9946a219396b0099f29d67d7c492eec126 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 14:02:32 +0200 Subject: [PATCH 15/34] propagate exception on parsing work (PR request) --- .../dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index d852a7023..c73e1efd1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -128,10 +128,7 @@ public class ActivitiesDumpReader { } } } catch (Exception e) { - Log - .warn( - "Parsing work from tar archive and xml work: " + filename + " " + e.getMessage()); -// Log.warn(e); + throw new Exception(filename, e); } if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) { @@ -143,7 +140,7 @@ public class ActivitiesDumpReader { } } } - } catch (IOException e) { + } catch (Exception e) { Log.warn("Parsing work from gzip archive: " + e.getMessage()); Log.warn(e); throw new RuntimeException(e); From c295c71ca0c77f5b3aed5817a872d9c9da77aade Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 14:07:26 +0200 Subject: [PATCH 16/34] added comment --- .../src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java | 1 + 1 file changed, 1 insertion(+) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index 8ebeab2e5..b62ad370e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -50,6 +50,7 @@ public class OrcidDSManager { protected FileSystem initFileSystemObject(Configuration conf) throws IOException { // Get the filesystem - HDFS + // if there is an exception, it will be propagate FileSystem fs = null; fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf); return fs; From c3114ba0aeaada891ff13de4c3e4f6469b40ba99 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 14:21:31 +0200 Subject: [PATCH 17/34] replaced null as return value with a more safe empty string --- .../dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index b984ee2b2..24f0f7a87 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -167,6 +167,6 @@ public class SparkGenEnrichedOrcidWorks { return name.getAsString(); } } - return null; + return new String(""); } } From 846ba3087310024cf3e28fb2c88f10259323f5f6 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 14:36:18 +0200 Subject: [PATCH 18/34] if typologies mapping fails, an exception will be propagated --- .../eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 4d1408470..deb83723b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -97,8 +97,8 @@ public class PublicationToOaf implements Serializable { .getResourceAsStream( "/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json")); typologiesMapping = new Gson().fromJson(tt, Map.class); - } catch (final Exception e) { - logger.error("loading typologies", e); + } catch (Exception e) { + throw new RuntimeException("loading typologies", e); } } From c58db1c8eab65e0e4ad7d27b7bbc5f815961f050 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 15:11:02 +0200 Subject: [PATCH 19/34] added filter on null value after map function --- .../eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index deb83723b..63979d1af 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -185,6 +185,7 @@ public class PublicationToOaf implements Serializable { .map(t -> { return mapStructuredProperty(t, q, null); }) + .filter(s -> s!=null) .collect(Collectors.toList())); // Adding identifier final String id = getStringValue(rootElement, "id"); @@ -376,7 +377,7 @@ public class PublicationToOaf implements Serializable { .map(r -> { return mapStructuredProperty(r, q, null); }) - .collect(Collectors.toList())); + .filter(s -> s!=null).collect(Collectors.toList())); } } From 1139d6568d392b61e97c12ce1ceac9b2a59b42e2 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 15:32:26 +0200 Subject: [PATCH 20/34] replaced null value with a more safe empty string as return value --- .../dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 8 +++++++- .../doiboost/orcidnodoi/util/DumpToActionsUtility.java | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 63979d1af..136356161 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -166,7 +166,13 @@ public class PublicationToOaf implements Serializable { // Adding source final String source = getStringValue(rootElement, "sourceName"); if (StringUtils.isNotBlank(source)) { - publication.setSource(Arrays.asList(mapStringField(source, null))); + Field sourceField = mapStringField(source, null); + if (sourceField==null) { + publication.setSource(null); + } + else { + publication.setSource(Arrays.asList(sourceField)); + } } // Adding titles diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java index ea4e58c44..8096c4e8e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java @@ -20,7 +20,7 @@ public class DumpToActionsUtility { public static String getStringValue(final JsonObject root, final String key) { if (root.has(key) && !root.get(key).isJsonNull()) return root.get(key).getAsString(); - return null; + return new String(""); } public static List getArrayValues(final JsonObject root, final String key) { From a38ab57062955b425c1fec90875c16a6954fb83d Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 15:43:50 +0200 Subject: [PATCH 21/34] let run test methods --- .../eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index ca91a242a..bf5aba99b 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -41,7 +41,7 @@ public class OrcidNoDoiTest { String orcidIdA = "0000-0003-2760-1191"; @Test - @Ignore +// @Ignore public void readPublicationFieldsTest() throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { logger.info("running loadPublicationFieldsTest ...."); @@ -95,7 +95,7 @@ public class OrcidNoDoiTest { } @Test - @Ignore +// @Ignore private void authorMatchTest() throws Exception { logger.info("running authorSimpleMatchTest ...."); String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; From b0290dbcb7728da0b447d38953702ab681bb1ce0 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 16:20:46 +0200 Subject: [PATCH 22/34] moved all dependencies version to main pom.xml --- dhp-workflows/dhp-doiboost/pom.xml | 4 ++-- pom.xml | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 357a57367..b81299cd1 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -14,7 
+14,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -51,7 +51,7 @@ org.apache.httpcomponents httpclient - 4.3.4 + ${org.apache.httpcomponents.version} eu.dnetlib.dhp diff --git a/pom.xml b/pom.xml index 9897c8abe..bae53fcc0 100644 --- a/pom.xml +++ b/pom.xml @@ -670,5 +670,7 @@ 3.5.3 4.13.0 1.8 + 4.3.4 + 4.0.1 From 210a50e4f486c195b627d462e64d8ee10c3dc70e Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 22 Oct 2020 16:24:42 +0200 Subject: [PATCH 23/34] replaced null value --- .../doiboost/orcidnodoi/oaf/PublicationToOaf.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 136356161..ece59c3f1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -167,10 +167,9 @@ public class PublicationToOaf implements Serializable { final String source = getStringValue(rootElement, "sourceName"); if (StringUtils.isNotBlank(source)) { Field sourceField = mapStringField(source, null); - if (sourceField==null) { + if (sourceField == null) { publication.setSource(null); - } - else { + } else { publication.setSource(Arrays.asList(sourceField)); } } @@ -191,7 +190,7 @@ public class PublicationToOaf implements Serializable { .map(t -> { return mapStructuredProperty(t, q, null); }) - .filter(s -> s!=null) + .filter(s -> s != null) .collect(Collectors.toList())); // Adding identifier final String id = getStringValue(rootElement, "id"); @@ -383,7 +382,8 @@ public class PublicationToOaf implements Serializable { .map(r -> { return mapStructuredProperty(r, q, null); }) - .filter(s -> s!=null).collect(Collectors.toList())); + .filter(s -> s != null) + .collect(Collectors.toList())); } } From 6bc7dbeca76e94f6cb00725aa50753d61d122952 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Fri, 6 Nov 2020 13:47:50 +0100 Subject: [PATCH 24/34] first version of dataset successful generated from orcid dump 2020 --- dhp-workflows/dhp-doiboost/pom.xml | 2 - .../doiboost/orcid/OrcidDSManager.java | 2 +- .../orcidnodoi/ActivitiesDumpReader.java | 4 +- .../orcidnodoi/GenOrcidAuthorWork.java | 2 +- .../SparkGenEnrichedOrcidWorks.java | 8 +- .../orcidnodoi/similarity/AuthorMatcher.java | 4 +- .../orcidnodoi/xml/XMLRecordParserNoDoi.java | 53 +++--- ... 
=> gen_orcid_authors_from_summaries.json} | 0 ...en_orcid_works-no-doi_from_activities.json | 7 + .../orcid/oozie_app/config-default.xml | 42 ----- .../dhp/doiboost/orcid/oozie_app/workflow.xml | 67 -------- .../oozie_app/config-default.xml | 2 +- .../orcid_activities/oozie_app/workflow.xml | 156 +++++++++++------- .../oozie_app/config-default.xml | 4 + .../orcid_summaries/oozie_app/workflow.xml | 14 +- .../oozie_app/workflow.xml | 4 +- .../doiboost/orcid/OrcidClientTest.java | 4 +- .../orcidnodoi/xml/OrcidNoDoiTest.java | 60 ++++++- ..._work_0000-0003-2760-1191_contributors.xml | 101 ++++++++++++ pom.xml | 12 ++ 20 files changed, 320 insertions(+), 228 deletions(-) rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{create_orcid_authors_data.json => gen_orcid_authors_from_summaries.json} (100%) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{gen_enriched_orcid_works => orcidnodoi}/oozie_app/workflow.xml (95%) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index b81299cd1..624dd7b31 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -51,7 +51,6 @@ org.apache.httpcomponents httpclient - ${org.apache.httpcomponents.version} eu.dnetlib.dhp @@ -87,7 +86,6 @@ org.apache.commons commons-text - ${common.text.version} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index b62ad370e..bf13db021 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -62,7 +62,7 @@ public class OrcidDSManager { .toString( OrcidDSManager.class .getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); + "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json"))); parser.parseArgument(args); hdfsServerUri = parser.get("hdfsServerUri"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index c73e1efd1..c2cfafd87 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -73,7 +73,7 @@ public class ActivitiesDumpReader { SequenceFile.Writer.valueClass(Text.class))) { while ((entry = tais.getNextTarEntry()) != null) { String filename = entry.getName(); - + StringBuffer buffer = new StringBuffer(); try { if (entry.isDirectory() || !filename.contains("works")) { @@ -83,7 +83,7 @@ public class ActivitiesDumpReader { BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from // tarInput String line; - StringBuffer buffer = 
new StringBuffer(); + buffer = new StringBuffer(); while ((line = br.readLine()) != null) { buffer.append(line); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index d32e6d945..d3e9aeaef 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -42,7 +42,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager { .toString( GenOrcidAuthorWork.class .getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); + "/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json"))); parser.parseArgument(args); hdfsServerUri = parser.get("hdfsServerUri"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 24f0f7a87..691ca3eee 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -67,7 +67,7 @@ public class SparkGenEnrichedOrcidWorks { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaPairRDD summariesRDD = sc - .sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class); + .sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class); Dataset summariesDataset = spark .createDataset( summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), @@ -96,8 +96,8 @@ public class SparkGenEnrichedOrcidWorks { Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .filter(Objects::nonNull) .toJavaRDD(); - enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); - logger.info("Works enriched data saved"); +// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); + logger.info("Enriched works RDD ready."); final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); final LongAccumulator enrichedPublications = spark @@ -132,7 +132,7 @@ public class SparkGenEnrichedOrcidWorks { .write() .format("parquet") .mode(SaveMode.Overwrite) - .save(workingPath + "no_doi_dataset/output"); + .save(workingPath + outputEnrichedWorksPath); logger.info("parsedPublications: " + parsedPublications.value().toString()); logger.info("enrichedPublications: " + enrichedPublications.value().toString()); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 88c84ee89..6a1468f4c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.text.Normalizer; import java.util.*; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,7 +41,7 @@ public class AuthorMatcher { int 
matchCounter = 0; List matchCounters = Arrays.asList(matchCounter); Contributor contributor = null; - contributors.forEach(c -> { + contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> { if (simpleMatch(c.getCreditName(), author.getName()) || simpleMatch(c.getCreditName(), author.getSurname()) || simpleMatch(c.getCreditName(), author.getOtherName())) { @@ -54,6 +55,7 @@ public class AuthorMatcher { Optional optCon = contributors .stream() .filter(c -> c.isSimpleMatch()) + .filter(c -> !StringUtils.isBlank(c.getCreditName())) .map(c -> { c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); return c; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java index c5c115551..f4b093402 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -183,39 +183,34 @@ public class XMLRecordParserNoDoi { private static List getContributors(VTDGen vg, VTDNav vn, AutoPilot ap) throws XPathParseException, NavException, XPathEvalException { List contributors = new ArrayList(); - int nameIndex = 0; - ap.selectXPath("//work:contributor/work:credit-name"); + ap.selectXPath("//work:contributors/work:contributor"); while (ap.evalXPath() != -1) { Contributor contributor = new Contributor(); - int t = vn.getText(); - if (t >= 0) { - contributor.setCreditName(vn.toNormalizedString(t)); - contributors.add(nameIndex, contributor); - nameIndex++; + if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) { + int val = vn.getText(); + if (val != -1) { + contributor.setCreditName(vn.toNormalizedString(val)); + } + vn.toElement(VTDNav.PARENT); } - } - if (contributors.size() == 0) { - return contributors; - } - - int sequenceIndex = 0; - ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence"); - while (ap.evalXPath() != -1) { - int t = vn.getText(); - if (t >= 0) { - contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t)); - sequenceIndex++; - } - } - - int roleIndex = 0; - ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role"); - while (ap.evalXPath() != -1) { - int t = vn.getText(); - if (t >= 0) { - contributors.get(roleIndex).setRole(vn.toNormalizedString(t)); - roleIndex++; + if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) { + if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) { + int val = vn.getText(); + if (val != -1) { + contributor.setSequence(vn.toNormalizedString(val)); + } + vn.toElement(VTDNav.PARENT); + } + if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) { + int val = vn.getText(); + if (val != -1) { + contributor.setRole(vn.toNormalizedString(val)); + } + vn.toElement(VTDNav.PARENT); + } + vn.toElement(VTDNav.PARENT); } + contributors.add(contributor); } return contributors; } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json rename to 
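The rewritten getContributors above visits each work:contributor element once and reads credit-name, sequence and role from that contributor's own subtree, instead of running three separate XPath passes that had to stay index-aligned. A stripped-down sketch of the same VTD-XML navigation pattern follows; only credit-name and role are handled, and the namespace URL shown is the standard ORCID work namespace, assumed here rather than quoted from the parser.

    import com.ximpleware.AutoPilot;
    import com.ximpleware.VTDGen;
    import com.ximpleware.VTDNav;

    public class ContributorParseSketch {

        // Parses one ORCID work record and prints a line per contributor.
        public static void parse(byte[] xml) throws Exception {
            VTDGen vg = new VTDGen();
            vg.setDoc(xml);
            vg.parse(true); // namespace aware
            VTDNav vn = vg.getNav();
            AutoPilot ap = new AutoPilot(vn);
            ap.declareXPathNameSpace("work", "http://www.orcid.org/ns/work");

            ap.selectXPath("//work:contributors/work:contributor");
            while (ap.evalXPath() != -1) {
                String creditName = childText(vn, "work:credit-name");
                String role = null;
                if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
                    role = childText(vn, "work:contributor-role");
                    vn.toElement(VTDNav.PARENT);
                }
                System.out.println(creditName + " / " + role);
            }
        }

        // Moves to the named child element, reads its text if present, then returns to the parent.
        private static String childText(VTDNav vn, String element) throws Exception {
            String text = null;
            if (vn.toElement(VTDNav.FIRST_CHILD, element)) {
                int t = vn.getText();
                if (t != -1) {
                    text = vn.toNormalizedString(t);
                }
                vn.toElement(VTDNav.PARENT);
            }
            return text;
        }
    }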
dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json new file mode 100644 index 000000000..c3a8f92ec --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json @@ -0,0 +1,7 @@ +[ + {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, + {"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}, + {"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml deleted file mode 100644 index fe14bb8cb..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml +++ /dev/null @@ -1,42 +0,0 @@ - - - jobTracker - hadoop-rm3.garr-pa1.d4science.org:8032 - - - nameNode - hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - hive_metastore_uris - thrift://hadoop-edge2.garr-pa1.d4science.org:9083 - - - spark2YarnHistoryServerAddress - http://hadoop-edge1.garr-pa1.d4science.org:18089/ - - - spark2EventLogDir - /user/spark/spark2ApplicationHistory - - - spark2ExtraListeners - "com.cloudera.spark.lineage.NavigatorAppListener" - - - spark2SqlQueryExecutionListeners - "com.cloudera.spark.lineage.NavigatorQueryListener" - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml deleted file mode 100644 index 51e00dc0f..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml +++ /dev/null @@ -1,67 +0,0 @@ - - - - workingPath - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz - - the shell command that downloads and puts to hdfs orcid summaries - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidDSManager - 
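Both the removed 2019 workflow above and the 2020 workflows below guard every download step with an fs:exists decision, so an archive already sitting in HDFS is not fetched and copied again. The same check expressed directly against the Hadoop FileSystem API would look roughly like the sketch below; the name node URI is illustrative only, while the working path and file name follow the workflow arguments.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class DownloadGuardSketch {

        // Mirrors the workflow's fs:exists decision: true means the archive still has to be downloaded.
        public static boolean needsDownload(String hdfsUri, String workingPath, String fileName) throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", hdfsUri);
            try (FileSystem fs = FileSystem.get(conf)) {
                return !fs.exists(new Path(workingPath, fileName));
            }
        }

        public static void main(String[] args) throws Exception {
            boolean fetch = needsDownload(
                "hdfs://nameservice1", "/data/orcid_activities_2020", "ORCID_2020_10_activites_0.tar.gz");
            System.out.println(fetch ? "fetch archive" : "already on HDFS, skip download");
        }
    }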
-w${workingPath}/ - -n${nameNode} - -fORCID_2019_summaries.tar.gz - -osummaries/output/ - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml index 3068562d0..05fe6d014 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml @@ -9,7 +9,7 @@ oozie.launcher.mapreduce.map.java.opts - -Xmx4g + -Xmx2g jobTracker diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml index 8f9a5123e..ea4d33296 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + workingPath @@ -6,70 +6,70 @@ shell_cmd_0 - wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz + wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz the shell command that downloads and puts to hdfs orcid activity file 0 shell_cmd_1 - wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz + wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz the shell command that downloads and puts to hdfs orcid activity file 1 shell_cmd_2 - wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz + wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz the shell command that downloads and puts to hdfs orcid activity file 2 shell_cmd_3 - wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz + wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal 
/tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz the shell command that downloads and puts to hdfs orcid activity file 3 - + shell_cmd_4 - wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz + wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz the shell command that downloads and puts to hdfs orcid activity file 4 - + shell_cmd_5 - wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz + wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz the shell command that downloads and puts to hdfs orcid activity file 5 - + shell_cmd_6 - wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz + wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz the shell command that downloads and puts to hdfs orcid activity file 6 shell_cmd_7 - wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz + wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz the shell command that downloads and puts to hdfs orcid activity file 7 shell_cmd_8 - wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz + wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz the shell command that downloads and puts to hdfs orcid activity file 8 shell_cmd_9 - wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f 
/tmp/ORCID_2019_activites_9.tar.gz + wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz the shell command that downloads and puts to hdfs orcid activity file 9 - + shell_cmd_X - wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz + wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz the shell command that downloads and puts to hdfs orcid activity file X - + @@ -82,11 +82,11 @@ - + - + @@ -102,8 +102,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))} @@ -118,7 +118,7 @@ ${shell_cmd_0} - + @@ -129,7 +129,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_0.tar.gz + -fORCID_2020_10_activites_0.tar.gz -owno_doi_works/works_0.seq -oewno_doi_enriched_works/ @@ -139,8 +139,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))} @@ -155,7 +155,7 @@ ${shell_cmd_1} - + @@ -166,7 +166,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_1.tar.gz + -fORCID_2020_10_activites_1.tar.gz -owno_doi_works/works_1.seq -oewno_doi_enriched_works/ @@ -176,8 +176,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))} @@ -192,7 +192,7 @@ ${shell_cmd_2} - + @@ -203,7 +203,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_2.tar.gz + -fORCID_2020_10_activites_2.tar.gz -owno_doi_works/works_2.seq -oewno_doi_enriched_works/ @@ -213,8 +213,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))} @@ -229,7 +229,7 @@ ${shell_cmd_3} - + @@ -240,7 +240,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_3.tar.gz + -fORCID_2020_10_activites_3.tar.gz -owno_doi_works/works_3.seq -oewno_doi_enriched_works/ @@ -250,8 +250,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))} @@ -266,7 +266,7 @@ ${shell_cmd_4} - + @@ -277,7 +277,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_4.tar.gz + -fORCID_2020_10_activites_4.tar.gz -owno_doi_works/works_4.seq -oewno_doi_enriched_works/ @@ -287,8 +287,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))} @@ -303,7 +303,7 @@ ${shell_cmd_5} - + @@ -314,7 +314,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_5.tar.gz + 
-fORCID_2020_10_activites_5.tar.gz -owno_doi_works/works_5.seq -oewno_doi_enriched_works/ @@ -324,8 +324,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))} @@ -340,7 +340,7 @@ ${shell_cmd_6} - + @@ -351,7 +351,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_6.tar.gz + -fORCID_2020_10_activites_6.tar.gz -owno_doi_works/works_6.seq -oewno_doi_enriched_works/ @@ -362,8 +362,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))} @@ -378,7 +378,7 @@ ${shell_cmd_7} - + @@ -389,7 +389,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_7.tar.gz + -fORCID_2020_10_activites_7.tar.gz -owno_doi_works/works_7.seq -oewno_doi_enriched_works/ @@ -399,8 +399,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))} @@ -415,7 +415,7 @@ ${shell_cmd_8} - + @@ -426,7 +426,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_8.tar.gz + -fORCID_2020_10_activites_8.tar.gz -owno_doi_works/works_8.seq -oewno_doi_enriched_works/ @@ -436,8 +436,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))} @@ -452,7 +452,7 @@ ${shell_cmd_9} - + @@ -463,7 +463,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_9.tar.gz + -fORCID_2020_10_activites_9.tar.gz -owno_doi_works/works_9.seq -oewno_doi_enriched_works/ @@ -473,8 +473,8 @@ - - ${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))} + + ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))} @@ -489,7 +489,7 @@ ${shell_cmd_X} - + @@ -500,7 +500,7 @@ eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork -w${workingPath}/ -n${nameNode} - -fORCID_2019_activites_X.tar.gz + -fORCID_2020_10_activites_X.tar.gz -owno_doi_works/works_X.seq -oewno_doi_enriched_works/ @@ -508,7 +508,35 @@ + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml index e77dd09c9..e1829e847 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml @@ -19,4 +19,8 @@ oozie.launcher.mapreduce.user.classpath.first true + + oozie.launcher.mapreduce.map.java.opts + -Xmx16g + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml index 3362cc67b..8517f35ee 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + 
workingPath @@ -6,7 +6,7 @@ shell_cmd_0 - wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz + wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz the shell command that downloads and puts to hdfs orcid summaries @@ -21,8 +21,8 @@ - - + + @@ -31,7 +31,7 @@ - ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} + ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))} @@ -57,8 +57,8 @@ eu.dnetlib.doiboost.orcid.OrcidDSManager -w${workingPath}/ -n${nameNode} - -fORCID_2019_summaries.tar.gz - -osummaries/output/ + -fORCID_2020_10_summaries.tar.gz + -oauthors/ diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml similarity index 95% rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index faed3104a..6cec48a6d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -59,7 +59,7 @@ - + @@ -85,7 +85,7 @@ -n${nameNode} -f- -owno_doi_works/ - -oewno_doi_enriched_works/output + -oewno_doi_dataset diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 5e0f91ecd..774475626 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -38,8 +38,8 @@ public class OrcidClientTest { @Test public void downloadTest() throws Exception { - String record = testDownloadRecord("0000-0002-2536-4498"); - File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml"); + String record = testDownloadRecord("0000-0001-6163-2042"); + File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml"); OutputStream outStream = new FileOutputStream(f); IOUtils.write(record.getBytes(), outStream); System.out.println("saved to tmp"); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index bf5aba99b..fa2980ac4 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -2,15 +2,20 @@ package eu.dnetlib.doiboost.orcidnodoi.xml; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.text.Normalizer; import java.util.*; +import javax.validation.constraints.AssertTrue; + import 
org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.similarity.JaccardSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.junit.jupiter.api.Test; +import org.mortbay.log.Log; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +46,6 @@ public class OrcidNoDoiTest { String orcidIdA = "0000-0003-2760-1191"; @Test -// @Ignore public void readPublicationFieldsTest() throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { logger.info("running loadPublicationFieldsTest ...."); @@ -95,8 +99,7 @@ public class OrcidNoDoiTest { } @Test -// @Ignore - private void authorMatchTest() throws Exception { + public void authorMatchTest() throws Exception { logger.info("running authorSimpleMatchTest ...."); String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; AuthorData author = new AuthorData(); @@ -121,9 +124,60 @@ public class OrcidNoDoiTest { logger.error("parsing xml", e); } assertNotNull(workData); + + Contributor a = workData.getContributors().get(0); + assertTrue(a.getCreditName().equals("Abdel-Dayem K")); + AuthorMatcher.match(author, workData.getContributors()); GsonBuilder builder = new GsonBuilder(); Gson gson = builder.create(); logger.info(gson.toJson(workData)); + + assertTrue(workData.getContributors().size() == 6); + Contributor c = workData.getContributors().get(0); + assertTrue(c.getOid().equals("0000-0003-2760-1191")); + assertTrue(c.getName().equals("Khairy")); + assertTrue(c.getSurname().equals("Abdel Dayem")); + assertTrue(c.getCreditName().equals("Abdel-Dayem K")); + } + + @Test + public void readContributorsTest() + throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + logger.info("running loadPublicationFieldsTest ...."); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml")); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData.getContributors()); + assertTrue(workData.getContributors().size() == 5); + assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName())); + assertTrue(workData.getContributors().get(0).getSequence().equals("seq0")); + assertTrue(workData.getContributors().get(0).getRole().equals("role0")); + assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence())); + assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole())); + assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2")); + assertTrue(workData.getContributors().get(2).getSequence().equals("seq2")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole())); + assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence())); + assertTrue(workData.getContributors().get(3).getRole().equals("role3")); + assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName())); + 
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4")); + assertTrue(workData.getContributors().get(4).getRole().equals("role4")); } } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml new file mode 100644 index 000000000..26e64aeda --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml @@ -0,0 +1,101 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + + seq0 + role0 + + + + creditname1 + + + creditname2 + + seq2 + + + + + creditname3 + + + role3 + + + + + + seq4 + role4 + + + + diff --git a/pom.xml b/pom.xml index d64de01ac..3629e2f1b 100644 --- a/pom.xml +++ b/pom.xml @@ -458,6 +458,18 @@ ${jsonschemagenerator.version} + + org.apache.commons + commons-text + ${common.text.version} + + + + org.apache.httpcomponents + httpclient + ${org.apache.httpcomponents.version} + + From 1513174d7ec367222c063ba47095ee7ca4897e99 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 10 Nov 2020 11:44:55 +0100 Subject: [PATCH 25/34] added further test case --- .../SparkGenEnrichedOrcidWorks.java | 2 +- .../orcidnodoi/similarity/AuthorMatcher.java | 50 +++-- .../orcidnodoi/xml/OrcidNoDoiTest.java | 181 ++++++++++++++++-- .../xml/activity_work_0000-0003-2760-1191.xml | 2 +- 4 files changed, 202 insertions(+), 33 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 691ca3eee..40cd212da 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -96,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks { Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .filter(Objects::nonNull) .toJavaRDD(); -// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); + enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/"); logger.info("Enriched works RDD ready."); final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 6a1468f4c..2f86820fb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -33,7 +33,7 @@ import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; public class AuthorMatcher { private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class); - private static final Double threshold = 0.8; + public static final Double threshold = 0.8; public static void match(AuthorData author, List contributors) throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { @@ -41,16 +41,35 @@ public class AuthorMatcher { int matchCounter = 0; List matchCounters = Arrays.asList(matchCounter); Contributor contributor = null; - contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> { - if (simpleMatch(c.getCreditName(), author.getName()) || - simpleMatch(c.getCreditName(), author.getSurname()) || - simpleMatch(c.getCreditName(), author.getOtherName())) { - matchCounters.set(0, matchCounters.get(0) + 1); - c.setSimpleMatch(true); - } - }); + contributors + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .forEach(c -> { + if (simpleMatch(c.getCreditName(), author.getName()) || + simpleMatch(c.getCreditName(), author.getSurname()) || + simpleMatch(c.getCreditName(), author.getOtherName())) { + matchCounters.set(0, matchCounters.get(0) + 1); + c.setSimpleMatch(true); + } + }); if (matchCounters.get(0) == 1) { updateAuthorsSimpleMatch(contributors, author); + } else if (matchCounters.get(0) == 0) { + Optional optCon = contributors + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .map(c -> { + c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); + return c; + }) + .filter(c -> c.getScore() >= threshold) + .max(Comparator.comparing(c -> c.getScore())); + Contributor bestMatchContributor = null; + if (optCon.isPresent()) { + bestMatchContributor = optCon.get(); + bestMatchContributor.setBestMatch(true); + updateAuthorsSimilarityMatch(contributors, author); + } } else if (matchCounters.get(0) > 1) { Optional optCon = contributors .stream() @@ -68,19 +87,18 @@ public class AuthorMatcher { bestMatchContributor.setBestMatch(true); updateAuthorsSimilarityMatch(contributors, author); } - } } - private static boolean simpleMatch(String name, String searchValue) { + public static boolean simpleMatch(String name, String searchValue) { if (searchValue == null) { return false; } return normalize(name).contains(normalize(searchValue)); } - private static Double bestMatch(String authorSurname, String authorName, String contributor) { + public static Double bestMatch(String authorSurname, String authorName, String contributor) { String[] contributorSplitted = contributor.split(" "); if (contributorSplitted.length == 0) { return 0.0; @@ -106,7 +124,7 @@ public class AuthorMatcher { return sm2; } - private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { + public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB); return score; } @@ -115,7 +133,7 @@ public class AuthorMatcher { return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); } - private static String normalize(final String s) { + public static String normalize(final String s) { if (s == null) { return new String(""); } @@ -140,7 +158,7 @@ public class AuthorMatcher { return surname 
+ " " + name; } - private static void updateAuthorsSimpleMatch(List contributors, AuthorData author) { + public static void updateAuthorsSimpleMatch(List contributors, AuthorData author) { contributors.forEach(c -> { if (c.isSimpleMatch()) { c.setName(author.getName()); @@ -151,7 +169,7 @@ public class AuthorMatcher { updateRanks(contributors); } - private static void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { + public static void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { contributors .stream() .filter(c -> c.isBestMatch()) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index fa2980ac4..c2c4ed5e1 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -38,12 +38,9 @@ public class OrcidNoDoiTest { private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class); - String nameA = "Khairy"; - String surnameA = "Abdel Dayem"; - String otherNameA = "Dayem MKA"; - String nameB = "K"; - String surnameB = "Abdel-Dayem"; - String orcidIdA = "0000-0003-2760-1191"; + static String nameA = "Khairy"; + static String surnameA = "Abdel Dayem"; + static String orcidIdA = "0000-0003-2760-1191"; @Test public void readPublicationFieldsTest() @@ -99,7 +96,7 @@ public class OrcidNoDoiTest { } @Test - public void authorMatchTest() throws Exception { + public void authorDoubleMatchTest() throws Exception { logger.info("running authorSimpleMatchTest ...."); String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; AuthorData author = new AuthorData(); @@ -129,16 +126,8 @@ public class OrcidNoDoiTest { assertTrue(a.getCreditName().equals("Abdel-Dayem K")); AuthorMatcher.match(author, workData.getContributors()); - GsonBuilder builder = new GsonBuilder(); - Gson gson = builder.create(); - logger.info(gson.toJson(workData)); assertTrue(workData.getContributors().size() == 6); - Contributor c = workData.getContributors().get(0); - assertTrue(c.getOid().equals("0000-0003-2760-1191")); - assertTrue(c.getName().equals("Khairy")); - assertTrue(c.getSurname().equals("Abdel Dayem")); - assertTrue(c.getCreditName().equals("Abdel-Dayem K")); } @Test @@ -180,4 +169,166 @@ public class OrcidNoDoiTest { assertTrue(workData.getContributors().get(4).getSequence().equals("seq4")); assertTrue(workData.getContributors().get(4).getRole().equals("role4")); } + + @Test + public void authorSimpleMatchTest() throws Exception { + String orcidWork = "activity_work_0000-0002-5982-8983.xml"; + AuthorData author = new AuthorData(); + author.setName("Parkhouse"); + author.setSurname("H."); + author.setOid("0000-0002-5982-8983"); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream(orcidWork)); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData); + + Contributor a = workData.getContributors().get(0); + assertTrue(a.getCreditName().equals("Parkhouse, H.")); + + AuthorMatcher.match(author, workData.getContributors()); 
+ + assertTrue(workData.getContributors().size() == 2); + Contributor c = workData.getContributors().get(0); + assertTrue(c.getOid().equals("0000-0002-5982-8983")); + assertTrue(c.getName().equals("Parkhouse")); + assertTrue(c.getSurname().equals("H.")); + assertTrue(c.getCreditName().equals("Parkhouse, H.")); + } + + @Test + public void match() { + + AuthorData author = new AuthorData(); + author.setName("Joe"); + author.setSurname("Dodge"); + author.setOid("0000-1111-2222-3333"); + Contributor contributor = new Contributor(); + contributor.setCreditName("Joe Dodge"); + List contributors = Arrays.asList(contributor); + AuthorMatcher am = new AuthorMatcher(); + int matchCounter = 0; + List matchCounters = Arrays.asList(matchCounter); + contributors + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .forEach(c -> { + if (am.simpleMatch(c.getCreditName(), author.getName()) || + am.simpleMatch(c.getCreditName(), author.getSurname()) || + am.simpleMatch(c.getCreditName(), author.getOtherName())) { + matchCounters.set(0, matchCounters.get(0) + 1); + c.setSimpleMatch(true); + } + }); + + assertTrue(matchCounters.get(0) == 1); + am.updateAuthorsSimpleMatch(contributors, author); + assertTrue(contributors.get(0).getName().equals("Joe")); + assertTrue(contributors.get(0).getSurname().equals("Dodge")); + assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge")); + assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333")); + + AuthorData authorX = new AuthorData(); + authorX.setName(nameA); + authorX.setSurname(surnameA); + authorX.setOid(orcidIdA); + Contributor contributorA = new Contributor(); + contributorA.setCreditName("Abdel-Dayem Khai"); + Contributor contributorB = new Contributor(); + contributorB.setCreditName("Abdel-Dayem Fake"); + List contributorList = new ArrayList<>(); + contributorList.add(contributorA); + contributorList.add(contributorB); + int matchCounter2 = 0; + List matchCounters2 = Arrays.asList(matchCounter2); + contributorList + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .forEach(c -> { + if (am.simpleMatch(c.getCreditName(), authorX.getName()) || + am.simpleMatch(c.getCreditName(), authorX.getSurname()) || + am.simpleMatch(c.getCreditName(), authorX.getOtherName())) { + int currentCounter = matchCounters2.get(0); + currentCounter += 1; + matchCounters2.set(0, currentCounter); + c.setSimpleMatch(true); + } + }); + + assertTrue(matchCounters2.get(0) == 2); + assertTrue(contributorList.get(0).isSimpleMatch()); + assertTrue(contributorList.get(1).isSimpleMatch()); + + Optional optCon = contributorList + .stream() + .filter(c -> c.isSimpleMatch()) + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .map(c -> { + c.setScore(am.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName())); + return c; + }) + .filter(c -> c.getScore() >= AuthorMatcher.threshold) + .max(Comparator.comparing(c -> c.getScore())); + assertTrue(optCon.isPresent()); + + final Contributor bestMatchContributor = optCon.get(); + bestMatchContributor.setBestMatch(true); + assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai")); + assertTrue(contributorList.get(0).isBestMatch()); + assertTrue(!contributorList.get(1).isBestMatch()); + am.updateAuthorsSimilarityMatch(contributorList, authorX); + assertTrue(contributorList.get(0).getName().equals(nameA)); + assertTrue(contributorList.get(0).getSurname().equals(surnameA)); + assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai")); + 
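The threshold logic exercised by these assertions reduces to a few lines: normalize both names, score the "surname name" form of the ORCID author against the contributor's credit name with Jaro-Winkler, and accept the contributor only when the score reaches 0.8. A compact sketch follows; the normalization is a simplified stand-in for the project's normalize(), the sample names are taken from the test data above, and the printed outcome is computed rather than asserted.

    import java.text.Normalizer;

    import org.apache.commons.text.similarity.JaroWinklerSimilarity;

    public class NameScoreSketch {

        private static final Double THRESHOLD = 0.8;

        // Simplified normalization: strip accents, lowercase, trim.
        private static String normalize(String s) {
            if (s == null) {
                return "";
            }
            return Normalizer
                .normalize(s, Normalizer.Form.NFD)
                .replaceAll("[^\\p{ASCII}]", "")
                .toLowerCase()
                .trim();
        }

        public static void main(String[] args) {
            String author = normalize("Abdel Dayem" + " " + "Khairy"); // surname + " " + name
            String contributor = normalize("Abdel-Dayem Khai");
            Double score = new JaroWinklerSimilarity().apply(author, contributor);
            System.out.println(score + (score >= THRESHOLD ? " -> accepted as best match" : " -> below threshold"));
        }
    }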
assertTrue(contributorList.get(0).getOid().equals(orcidIdA)); + assertTrue(StringUtils.isBlank(contributorList.get(1).getOid())); + } + + @Test + public void authorBestMatchTest() throws Exception { + String name = "Khairy"; + String surname = "Abdel Dayem"; + String orcidWork = "activity_work_0000-0003-2760-1191.xml"; + AuthorData author = new AuthorData(); + author.setName(name); + author.setSurname(surname); + author.setOid(orcidIdA); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream(orcidWork)); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + AuthorMatcher.match(author, workData.getContributors()); + assertTrue(workData.getContributors().size() == 5); + List c = workData.getContributors(); + assertTrue(c.get(0).getName().equals(name)); + assertTrue(c.get(0).getSurname().equals(surname)); + assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye")); + assertTrue(c.get(0).getOid().equals(orcidIdA)); + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml index 485f4f8e8..83752b145 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml @@ -68,7 +68,7 @@ http://europepmc.org/abstract/med/27899851 - Abdel-Dayem K + Khair Abde Daye first author From 1f861f2b0de77d6a693c5d5144696409c32592a7 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 11 Nov 2020 17:38:50 +0100 Subject: [PATCH 26/34] now wf output is a sequence file with the format seq("eu.dnetlib.dhp.schema.oaf.Publication",eu.dnetlib.dhp.schema.action.AtomicActions) --- .../SparkGenEnrichedOrcidWorks.java | 62 ++++++++++--------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 40cd212da..7f715fa7d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -1,14 +1,21 @@ package eu.dnetlib.doiboost.orcidnodoi; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.IOException; -import java.util.Objects; -import java.util.Optional; - +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.doiboost.orcid.json.JsonHelper; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; +import 
eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -16,24 +23,17 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import com.google.gson.Gson; -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.doiboost.orcid.json.JsonHelper; -import eu.dnetlib.doiboost.orcid.model.AuthorData; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; -import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; -import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; +import java.io.IOException; +import java.util.Objects; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + /** * This spark job generates one parquet file, containing orcid publications dataset */ @@ -42,6 +42,8 @@ public class SparkGenEnrichedOrcidWorks { static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static void main(String[] args) throws IOException, Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -58,6 +60,7 @@ public class SparkGenEnrichedOrcidWorks { final String workingPath = parser.get("workingPath"); final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath"); final String outputWorksPath = parser.get("outputWorksPath"); + final String hdfsServerUri = parser.get("hdfsServerUri"); SparkConf conf = new SparkConf(); runWithSparkSession( @@ -96,7 +99,6 @@ public class SparkGenEnrichedOrcidWorks { Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .filter(Objects::nonNull) .toJavaRDD(); - enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/"); logger.info("Enriched works RDD ready."); final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); @@ -124,15 +126,17 @@ public class SparkGenEnrichedOrcidWorks { }) .filter(p -> p != null); - Dataset publicationDataset = spark - .createDataset( - oafPublicationRDD.repartition(1).rdd(), - Encoders.bean(Publication.class)); - publicationDataset - .write() - .format("parquet") - .mode(SaveMode.Overwrite) - .save(workingPath + outputEnrichedWorksPath); + oafPublicationRDD + .mapToPair( + p -> new Tuple2<>(p.getClass().toString(), + OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, (Publication) p)))) + .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) + .saveAsNewAPIHadoopFile( + workingPath.concat(outputEnrichedWorksPath), + Text.class, + Text.class, + SequenceFileOutputFormat.class, + sc.hadoopConfiguration()); logger.info("parsedPublications: " + parsedPublications.value().toString()); logger.info("enrichedPublications: " + enrichedPublications.value().toString()); From 13f28fa225d248e080c13d99694c3069826b3184 Mon Sep 17 00:00:00 2001 From: Enrico 
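With this change the enriched works are no longer written as a parquet Dataset but as a Hadoop sequence file whose key is the payload class name and whose value is the JSON of an AtomicAction wrapping the Publication. One quick way to verify such an output is to read the pairs back and count records per payload class; the sketch below is only an illustrative inspection job, not the project's promote/consume step, and the output path is assumed from the workflow arguments.

    import org.apache.hadoop.io.Text;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;

    import scala.Tuple2;

    public class ReadActionSetSketch {

        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("inspectNoDoiDataset").setMaster("local[*]");
            try (JavaSparkContext sc = new JavaSparkContext(conf)) {
                // Same key/value types the writer used: Text(class name) -> Text(serialized AtomicAction).
                JavaPairRDD<Text, Text> actions = sc
                    .sequenceFile("/data/orcid_activities_2020/no_doi_dataset", Text.class, Text.class);
                actions
                    .mapToPair(t -> new Tuple2<>(t._1().toString(), 1L))
                    .reduceByKey(Long::sum)
                    .collect()
                    .forEach(c -> System.out.println(c._1() + " -> " + c._2()));
            }
        }
    }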
Ottonello Date: Thu, 12 Nov 2020 17:43:32 +0100 Subject: [PATCH 27/34] moved AuthorData to dhp-schemas; added other names to author data --- .../dnetlib/dhp/schema/orcid}/AuthorData.java | 18 +- .../orcid/SparkGenerateDoiAuthorList.java | 5 +- .../doiboost/orcid/SummariesDecompressor.java | 7 +- .../doiboost/orcid/xml/XMLRecordParser.java | 8 +- .../SparkGenEnrichedOrcidWorks.java | 38 +- .../doiboost/orcidnodoi/json/JsonWriter.java | 19 +- .../orcidnodoi/model/Contributor.java | 2 +- .../orcidnodoi/similarity/AuthorMatcher.java | 11 +- .../oozie_app/config-default.xml | 2 +- .../orcid/xml/XMLRecordParserTest.java | 21 +- .../orcidnodoi/xml/OrcidNoDoiTest.java | 12 +- .../orcid/xml/record_8888-8888-8888-8880.xml | 770 ++++++++++++++++++ .../summary_0000-0001-5109-1000_othername.xml | 196 +++++ 13 files changed, 1053 insertions(+), 56 deletions(-) rename {dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid}/AuthorData.java (71%) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorData.java similarity index 71% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorData.java index e0624509b..6c94cdb13 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorData.java @@ -1,7 +1,10 @@ -package eu.dnetlib.doiboost.orcid.model; +package eu.dnetlib.dhp.schema.orcid; import java.io.Serializable; +import java.util.List; + +import com.google.common.collect.Lists; /** * This class models the data that are retrieved from orcid publication @@ -13,8 +16,8 @@ public class AuthorData implements Serializable { private String name; private String surname; private String creditName; - private String otherName; private String errorCode; + private List otherNames; public String getErrorCode() { return errorCode; @@ -56,11 +59,14 @@ public class AuthorData implements Serializable { this.oid = oid; } - public String getOtherName() { - return otherName; + public List getOtherNames() { + return otherNames; } - public void setOtherName(String otherName) { - this.otherName = otherName; + public void setOtherNames(List otherNames) { + if (this.otherNames == null) { + this.otherNames = Lists.newArrayList(); + } + this.otherNames = otherNames; } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java index b4239bba2..011c153ec 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java @@ -13,9 +13,6 @@ import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapreduce.Job; -import 
org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -33,7 +30,7 @@ import com.google.gson.JsonElement; import com.google.gson.JsonParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; import scala.Tuple2; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index 29d72ed0b..d1b2a1d73 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; @@ -56,6 +56,7 @@ public class SummariesDecompressor { int nameFound = 0; int surnameFound = 0; int creditNameFound = 0; + int otherNamesFound = 0; int errorFromOrcidFound = 0; int xmlParserErrorFound = 0; try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { @@ -117,6 +118,9 @@ public class SummariesDecompressor { if (authorData.getCreditName() != null) { creditNameFound += 1; } + if (authorData.getOtherNames() != null && authorData.getOtherNames().size() > 1) { + otherNamesFound += authorData.getOtherNames().size(); + } } else { Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString()); @@ -152,6 +156,7 @@ public class SummariesDecompressor { Log.info("Name found: " + nameFound); Log.info("Surname found: " + surnameFound); Log.info("Credit name found: " + creditNameFound); + Log.info("Other names found: " + otherNamesFound); Log.info("Error from Orcid found: " + errorFromOrcidFound); Log.info("Error parsing xml record found: " + xmlParserErrorFound); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java index 2e43f4d3e..a807cf132 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java @@ -14,7 +14,7 @@ import com.ximpleware.VTDNav; import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; public class XMLRecordParser { @@ -81,6 +81,12 @@ public class XMLRecordParser { if (!creditNames.isEmpty()) { authorData.setCreditName(creditNames.get(0)); } + + final List otherNames = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content"); + if (!otherNames.isEmpty()) { + authorData.setOtherNames(otherNames); + } + return authorData; } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 7f715fa7d..cc65b0b4f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -1,18 +1,12 @@ package eu.dnetlib.doiboost.orcidnodoi; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.Gson; -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.doiboost.orcid.json.JsonHelper; -import eu.dnetlib.doiboost.orcid.model.AuthorData; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; -import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; -import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.IOException; +import java.util.Objects; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; @@ -26,14 +20,22 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.orcid.AuthorData; +import eu.dnetlib.doiboost.orcid.json.JsonHelper; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; +import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; -import java.io.IOException; -import java.util.Objects; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** * This spark job generates one parquet file, containing orcid publications dataset */ diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java index 363cb13e6..982fb6316 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java @@ -1,9 +1,12 @@ package eu.dnetlib.doiboost.orcidnodoi.json; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.JsonObject; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; /** @@ -12,15 +15,11 @@ import eu.dnetlib.doiboost.orcid.model.WorkData; public class JsonWriter { - public static String create(AuthorData authorData) { - JsonObject author = new JsonObject(); - author.addProperty("oid", authorData.getOid()); - author.addProperty("name", authorData.getName()); - author.addProperty("surname", 
authorData.getSurname()); - if (authorData.getCreditName() != null) { - author.addProperty("creditname", authorData.getCreditName()); - } - return author.toString(); + public static final com.fasterxml.jackson.databind.ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .setSerializationInclusion(JsonInclude.Include.NON_NULL);; + + public static String create(AuthorData authorData) throws JsonProcessingException { + return OBJECT_MAPPER.writeValueAsString(authorData); } public static String create(WorkData workData) { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java index 9a8651c85..9222c1cc4 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java @@ -3,7 +3,7 @@ package eu.dnetlib.doiboost.orcidnodoi.model; import java.io.Serializable; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; /** * This class models the data related to a contributor, that are retrieved from an orcid publication diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 2f86820fb..c0f617868 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -18,7 +18,7 @@ import com.ximpleware.XPathEvalException; import com.ximpleware.XPathParseException; import eu.dnetlib.dhp.parser.utility.VtdException; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; @@ -47,7 +47,7 @@ public class AuthorMatcher { .forEach(c -> { if (simpleMatch(c.getCreditName(), author.getName()) || simpleMatch(c.getCreditName(), author.getSurname()) || - simpleMatch(c.getCreditName(), author.getOtherName())) { + simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) { matchCounters.set(0, matchCounters.get(0) + 1); c.setSimpleMatch(true); } @@ -91,6 +91,13 @@ public class AuthorMatcher { } + public static boolean simpleMatchOnOtherNames(String name, List otherNames) { + if (otherNames == null || (otherNames != null && otherNames.isEmpty())) { + return false; + } + return otherNames.stream().filter(o -> simpleMatch(name, o)).count() > 0; + } + public static boolean simpleMatch(String name, String searchValue) { if (searchValue == null) { return false; diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml index e1829e847..191654378 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml @@ -21,6 +21,6 @@ oozie.launcher.mapreduce.map.java.opts - -Xmx16g + -Xmx8g \ No newline at end of file diff --git 
a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java index 5bf6f27b9..b7be5e5cd 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -2,12 +2,14 @@ package eu.dnetlib.doiboost.orcid.xml; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; +import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; public class XMLRecordParserTest { @@ -55,4 +57,21 @@ public class XMLRecordParserTest { assertNotNull(workData.getDoi()); System.out.println("doi: " + workData.getDoi()); } + + @Test + public void testOrcidOtherNamesXMLParser() throws Exception { + + String xml = IOUtils + .toString( + this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml")); + + XMLRecordParser p = new XMLRecordParser(); + + AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes()); + assertNotNull(authorData); + assertNotNull(authorData.getOtherNames()); + assertTrue(authorData.getOtherNames().get(0).equals("Andrew C. Porteus")); + String jsonData = JsonWriter.create(authorData); + assertNotNull(jsonData); + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index c2c4ed5e1..948e5b094 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -5,34 +5,24 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; -import java.text.Normalizer; import java.util.*; -import javax.validation.constraints.AssertTrue; - import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.text.similarity.JaccardSimilarity; -import org.apache.commons.text.similarity.JaroWinklerSimilarity; import org.junit.jupiter.api.Test; -import org.mortbay.log.Log; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; import com.ximpleware.NavException; import com.ximpleware.ParseException; import com.ximpleware.XPathEvalException; import com.ximpleware.XPathParseException; import eu.dnetlib.dhp.parser.utility.VtdException; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; -import jdk.nashorn.internal.ir.annotations.Ignore; public class OrcidNoDoiTest { diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml 
b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml new file mode 100644 index 000000000..7abc2f35a --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml @@ -0,0 +1,770 @@ + + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + zh_CN + + + API + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + true + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + 2001-12-31T12:00:00 + true + true + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + give-names + family-name + credit-name + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + other-name-1 + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + biography + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + url-name-1 + http://url.com/ + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + user1@email.com + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + US + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + keyword1 + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + type-1 + value-1 + http://url.com/1 + self + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + + agr + external-id-value + + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + distinction:department-name + distinction:role-title + + 1948 + 02 + 02 + + + 1948 + 02 + 02 + + + distinction-org + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-distinction + GRID + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + + agr + external-id-value + + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + education:department-name + education:role-title + + 1948 + 02 + 02 + + + 2019 + 01 + 01 + + + education-org + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-education + GRID + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + + agr + external-id-value + + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + employment:department-name + employment:role-title + + 1948 + 02 + 02 + + + 2025 + + + employment-org + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-employment + GRID + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + grant_number + external-id-value-1 + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + + common:title + common:translated-title + + + + grant_number + 
external-id-value-1 + http://tempuri.org + self + + + grant + + 1948 + 02 + 02 + + + 1948 + 02 + 02 + + + common:name + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-funding + FUNDREF + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + + agr + external-id-value + + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + invited-position:department-name + invited-position:role-title + + 2019 + 01 + 01 + + + 2025 + 01 + 01 + + + invited-position-org + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-invited-position + GRID + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + + agr + external-id-value + + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + membership:department-name + membership:role-title + + 1948 + 02 + 02 + + + membership-org + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-membership + RINGGOLD + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + something + external-id-value + http://orcid.org + self + + + + 2001-12-31T12:00:00 + + + something + external-id-value + + http://orcid.org + self + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/client/APP-9999999999999901 + APP-9999999999999901 + orcid.org + + + + reviewer + + + something + external-id-value + http://orcid.org + self + + + http://orcid.org + review + + 1948 + 02 + 02 + + orcid-generated:12345 + + common:name + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-peer-review + RINGGOLD + + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + + agr + external-id-value + + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + qualification:department-name + qualification:role-title + + 1948 + 02 + 02 + + + 2025 + 12 + + + qualification-org + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-qualification + RINGGOLD + + + + + + + 2017-01-18T15:06:05.147-06:00 + + 2017-01-18T15:03:56.856-06:00 + + + proposal_id + 123456 + self + + + + + 2015-06-25T16:01:12.718Z + 2017-09-08T13:31:19.987Z + + + + https://orcid.org/0000-0000-0000-0000 + 0000-0000-0000-0000 + orcid.org + + XSEDE ORCID integration + + + + + Giant Laser Award + + + + XSEDE + + city + region + US + + + XX + grid + + + + + + proposal_id + 123456 + self + + + + 1999 + 02 + 02 + + + 2012 + 02 + 02 + + http://xsede.org/GiantLaserAward + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + + agr + external-id-value + + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + service:department-name + service:role-title + + 1948 + 02 + 02 + + + service-org + + common:city + common:region + AF + + + common:disambiguated-organization-identifier-service + RINGGOLD + + + + + + + 2001-12-31T12:00:00 + + 2001-12-31T12:00:00 + + + agr + external-id-value + http://orcid.org + part-of + + + + 2001-12-31T12:00:00 + 2001-12-31T12:00:00 + + + https://orcid.org/client/8888-8888-8888-8880 + 8888-8888-8888-8880 + orcid.org + + + + + common:title + + common:translated-title + + + + agr + 
external-id-value + http://tempuri.org + self + + + artistic-performance + + 1948 + 02 + 02 + + Procedia Computer Science + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml new file mode 100644 index 000000000..43bc96b8c --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml @@ -0,0 +1,196 @@ + + + + https://orcid.org/0000-0001-5109-1000 + 0000-0001-5109-1000 + orcid.org + + + en + + + Member-referred + 2019-05-01T13:04:57.507Z + 2019-05-01T13:59:54.268Z + true + true + true + + + 2019-05-01T13:45:47.727Z + + 2019-05-01T13:04:57.507Z + 2019-05-01T13:04:57.740Z + Andrew + Porteus + + + 2019-05-01T13:44:57.072Z + + 2019-05-01T13:44:57.072Z + 2019-05-01T13:44:57.072Z + + + https://orcid.org/0000-0001-5109-1000 + 0000-0001-5109-1000 + orcid.org + + Andrew Porteus + + Andrew C. Porteus + + + + 2019-05-01T13:59:54.263Z + 2019-05-01T13:59:54.263Z + Retired Librarian + + + 2019-05-01T13:45:47.727Z + + 2019-05-01T13:45:47.727Z + 2019-05-01T13:45:47.727Z + + + https://orcid.org/0000-0001-5109-1000 + 0000-0001-5109-1000 + orcid.org + + Andrew Porteus + + Niagara Falls Poetry Project + http://niagarapoetry.ca + + + + + 2019-05-01T13:45:09.764Z + + 2019-05-01T13:45:09.764Z + 2019-05-01T13:45:09.764Z + + + https://orcid.org/0000-0001-5109-1000 + 0000-0001-5109-1000 + orcid.org + + Andrew Porteus + + CA + + + + + + + 2019-05-01T13:57:45.787Z + + + 2019-05-01T13:15:26.102Z + + 2019-05-01T13:15:26.102Z + + + 2019-05-01T13:15:26.102Z + 2019-05-01T13:15:26.102Z + + + https://orcid.org/0000-0001-5109-1000 + 0000-0001-5109-1000 + orcid.org + + Andrew Porteus + + Library Technician Diploma + + 1976 + 09 + + + 1978 + 05 + + + Niagara College + + Welland + ON + CA + + + 125147 + RINGGOLD + + + + + + + + + + + + 2019-05-01T13:19:49.021Z + + 2019-05-01T13:19:49.021Z + + + 2019-05-01T13:19:49.021Z + 2019-05-01T13:19:49.021Z + + + https://orcid.org/0000-0001-5109-1000 + 0000-0001-5109-1000 + orcid.org + + Andrew Porteus + + Communication, Film & Popular Culture + Master's Candidate + + 2018 + 09 + + + Brock University + + Saint Catharines + ON + CA + + + 7497 + RINGGOLD + + + + + + + \ No newline at end of file From 9a2fa9dc2f45f030ca358621ddbbd51a4be3bf2c Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Fri, 13 Nov 2020 10:25:34 +0100 Subject: [PATCH 28/34] added test for other names parsing from summaries dump --- .../orcidnodoi/xml/OrcidNoDoiTest.java | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index 948e5b094..1f77197ab 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -13,6 +13,7 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.Lists; import com.ximpleware.NavException; import com.ximpleware.ParseException; import com.ximpleware.XPathEvalException; @@ -218,7 +219,7 @@ public class OrcidNoDoiTest { .forEach(c -> { if 
(am.simpleMatch(c.getCreditName(), author.getName()) || am.simpleMatch(c.getCreditName(), author.getSurname()) || - am.simpleMatch(c.getCreditName(), author.getOtherName())) { + am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) { matchCounters.set(0, matchCounters.get(0) + 1); c.setSimpleMatch(true); } @@ -250,7 +251,7 @@ public class OrcidNoDoiTest { .forEach(c -> { if (am.simpleMatch(c.getCreditName(), authorX.getName()) || am.simpleMatch(c.getCreditName(), authorX.getSurname()) || - am.simpleMatch(c.getCreditName(), authorX.getOtherName())) { + am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) { int currentCounter = matchCounters2.get(0); currentCounter += 1; matchCounters2.set(0, currentCounter); @@ -321,4 +322,27 @@ public class OrcidNoDoiTest { assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye")); assertTrue(c.get(0).getOid().equals(orcidIdA)); } + + @Test + public void otherNamesMatchTest() + throws VtdException, ParseException, IOException, XPathEvalException, NavException, XPathParseException { + + AuthorData author = new AuthorData(); + author.setName("Joe"); + author.setSurname("Dodge"); + author.setOid("0000-1111-2222-3333"); + String otherName1 = new String("Joe Dr. Dodge"); + String otherName2 = new String("XY"); + List others = Lists.newArrayList(); + others.add(otherName1); + others.add(otherName2); + author.setOtherNames(others); + Contributor contributor = new Contributor(); + contributor.setCreditName("XY"); + List contributors = Arrays.asList(contributor); + AuthorMatcher.match(author, contributors); + assertTrue(contributors.get(0).getName().equals("Joe")); + assertTrue(contributors.get(0).getSurname().equals("Dodge")); + assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333")); + } } From 005f849674c93f44e9a3e66b86211dd8f38f8919 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Fri, 13 Nov 2020 12:45:31 +0100 Subject: [PATCH 29/34] added compression to output dataset --- .../dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index cc65b0b4f..a92d534d8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -128,6 +128,8 @@ public class SparkGenEnrichedOrcidWorks { }) .filter(p -> p != null); + sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true"); + oafPublicationRDD .mapToPair( p -> new Tuple2<>(p.getClass().toString(), From c0c2e05eae56c3dad6e111177d88f1959b654d2e Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 17 Nov 2020 18:23:12 +0100 Subject: [PATCH 30/34] added wf to extracting authors and works xml data from orcid dump to hdfs; added wf to download the lamda file (containing last orcid update informations) from orcid to hdfs --- .../orcid/ActivitiesDecompressor.java | 61 +++++ .../orcid/ExtractXMLActivitiesData.java | 54 ++++ .../orcid/ExtractXMLSummariesData.java | 56 +++++ .../doiboost/orcid/SummariesDecompressor.java | 64 +++++ .../doiboost/orcid/xml/XMLRecordParser.java | 31 +++ .../orcid_download/oozie_app/workflow.xml | 45 ---- .../oozie_app/workflow.xml | 232 ++++++++++++++++++ 
.../oozie_app/config-default.xml | 26 ++ .../oozie_app/workflow.xml | 40 +++ .../oozie_app/config-default.xml | 0 .../oozie_app/workflow.xml | 64 +++++ 11 files changed, 628 insertions(+), 45 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{orcid_download => orcid_updates_download}/oozie_app/config-default.xml (100%) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java index 02d2b267b..420c363ec 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java @@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.io.compress.GzipCodec; import org.mortbay.log.Log; import eu.dnetlib.doiboost.orcid.model.WorkData; @@ -143,4 +144,64 @@ public class ActivitiesDecompressor { Log.info("Error from Orcid found: " + errorFromOrcidFound); Log.info("Error parsing xml work found: " + xmlParserErrorFound); } + + public static void extractXML(Configuration conf, String inputUri, Path outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + int counter = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + if (entry.isDirectory() || !filename.contains("works")) { + } else { + counter++; + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); + 
String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + String xml = buffer.toString(); + String[] filenameParts = filename.split("/"); + final Text key = new Text( + XMLRecordParser + .retrieveOrcidIdFromActivity( + xml.getBytes(), filenameParts[filenameParts.length - 1])); + final Text value = new Text(xml); + writer.append(key, value); + if ((counter % 100000) == 0) { + Log.info("Current xml works extracted: " + counter); + } + } + } + } + } + Log.info("Activities extraction completed"); + Log.info("Total XML works parsed: " + counter); + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } + } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java new file mode 100644 index 000000000..c834efa20 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java @@ -0,0 +1,54 @@ + +package eu.dnetlib.doiboost.orcid; + +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.mortbay.log.Log; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork; + +public class ExtractXMLActivitiesData extends OrcidDSManager { + private String outputWorksPath; + private String activitiesFileNameTarGz; + + public static void main(String[] args) throws IOException, Exception { + ExtractXMLActivitiesData extractXMLActivitiesData = new ExtractXMLActivitiesData(); + extractXMLActivitiesData.loadArgs(args); + extractXMLActivitiesData.extractWorks(); + } + + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenOrcidAuthorWork.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json"))); + parser.parseArgument(args); + + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + workingPath = parser.get("workingPath"); + Log.info("Working Path: " + workingPath); + activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); + Log.info("Activities File Name: " + activitiesFileNameTarGz); + outputWorksPath = parser.get("outputWorksPath"); + Log.info("Output Author Work Data: " + outputWorksPath); + } + + private void extractWorks() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz); + Path outputPath = new Path( + hdfsServerUri + .concat(workingPath) + .concat(outputWorksPath)); + ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java new file mode 100644 index 000000000..843889108 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java @@ -0,0 +1,56 @@ + +package eu.dnetlib.doiboost.orcid; + +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import 
org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.mortbay.log.Log; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork; + +public class ExtractXMLSummariesData extends OrcidDSManager { + + private String outputAuthorsPath; + private String summariesFileNameTarGz; + + public static void main(String[] args) throws IOException, Exception { + ExtractXMLSummariesData extractXMLSummariesData = new ExtractXMLSummariesData(); + extractXMLSummariesData.loadArgs(args); + extractXMLSummariesData.extractAuthors(); + } + + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenOrcidAuthorWork.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json"))); + parser.parseArgument(args); + + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + workingPath = parser.get("workingPath"); + Log.info("Working Path: " + workingPath); + summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); + Log.info("Summaries File Name: " + summariesFileNameTarGz); + outputAuthorsPath = parser.get("outputAuthorsPath"); + Log.info("Output Authors Data: " + outputAuthorsPath); + } + + public void extractAuthors() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz); + Path outputPath = new Path( + hdfsServerUri + .concat(workingPath) + .concat(outputAuthorsPath) + .concat("xml_authors.seq")); + SummariesDecompressor.extractXML(conf, tarGzUri, outputPath); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index d1b2a1d73..c16899977 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.io.compress.GzipCodec; import org.mortbay.log.Log; import eu.dnetlib.dhp.schema.orcid.AuthorData; @@ -160,4 +161,67 @@ public class SummariesDecompressor { Log.info("Error from Orcid found: " + errorFromOrcidFound); Log.info("Error parsing xml record found: " + xmlParserErrorFound); } + + public static void extractXML(Configuration conf, String inputUri, Path outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + int counter = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = 
null; + CompressionCodec Codec = new GzipCodec(); + org.apache.hadoop.io.SequenceFile.Writer.Option optCom = SequenceFile.Writer + .compression(SequenceFile.CompressionType.RECORD, Codec); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class), optCom)) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + if (entry.isDirectory()) { + Log.debug("Directory entry name: " + entry.getName()); + } else { + Log.debug("XML record entry name: " + entry.getName()); + counter++; + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + String xml = buffer.toString(); + final Text key = new Text( + XMLRecordParser + .retrieveOrcidIdFromSummary( + xml.getBytes(), filename.split("/")[2].substring(0, 19))); + final Text value = new Text(xml); + writer.append(key, value); + } + if ((counter % 100000) == 0) { + Log.info("Current xml records extracted: " + counter); + } + } + } + } + Log.info("Summaries extract completed"); + Log.info("Total XML records parsed: " + counter); + + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java index a807cf132..cc9abb621 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java @@ -4,6 +4,8 @@ package eu.dnetlib.doiboost.orcid.xml; import java.util.Arrays; import java.util.List; +import org.mortbay.log.Log; + import com.ximpleware.AutoPilot; import com.ximpleware.EOFException; import com.ximpleware.EncodingException; @@ -126,4 +128,33 @@ public class XMLRecordParser { } return workData; } + + public static String retrieveOrcidIdFromSummary(byte[] bytes, String defaultValue) + throws VtdException, ParseException { + return retrieveOrcidId(bytes, defaultValue, NS_RECORD, NS_RECORD_URL, "//record:record", "path").substring(1); + } + + public static String retrieveOrcidIdFromActivity(byte[] bytes, String defaultValue) + throws VtdException, ParseException { + return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code"); + } + + private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath, + String idAttributeName) + throws VtdException, ParseException { + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(ns, nsUrl); + List recordNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, xpath, Arrays.asList(idAttributeName)); + if (!recordNodes.isEmpty()) { + return (recordNodes.get(0).getAttributes().get(idAttributeName)); + } + Log.info("id not found - default: " + defaultValue); + return defaultValue; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml deleted file mode 
100644 index 1f9adeb4d..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - workingPathOrcid - the working dir base path - - - token - access token - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidDownloader - -d${workingPathOrcid}/ - -n${nameNode} - -flast_modified.csv - -odownload/ - -t${token} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml new file mode 100644 index 000000000..6f629c754 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml @@ -0,0 +1,232 @@ + + + + workingPath + the working dir base path + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.java + ${oozieActionShareLibForSpark2} + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx2g + + + oozie.use.system.libpath + true + + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_0.tar.gz + -owxml/works/xml_works_0.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_1.tar.gz + -owxml/works/xml_works_1.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_2.tar.gz + -owxml/works/xml_works_2.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_3.tar.gz + -owxml/works/xml_works_3.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_4.tar.gz + -owxml/works/xml_works_4.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_5.tar.gz + -owxml/works/xml_works_5.seq + -oew--- + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_6.tar.gz + -owxml/works/xml_works_6.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_7.tar.gz + -owxml/works/xml_works_7.seq + -oew--- + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_8.tar.gz + -owxml/works/xml_works_8.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_9.tar.gz + 
-owxml/works/xml_works_9.seq + -oew--- + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_activites_X.tar.gz + -owxml/works/xml_works_X.seq + -oew--- + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml new file mode 100644 index 000000000..191654378 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml @@ -0,0 +1,26 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx8g + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml new file mode 100644 index 000000000..68d468ab3 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml @@ -0,0 +1,40 @@ + + + + workingPath + the working dir base path + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData + -w${workingPath}/ + -n${nameNode} + -fORCID_2020_10_summaries.tar.gz + -oxml/authors/ + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/config-default.xml rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml new file mode 100644 index 000000000..a3daab116 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml @@ -0,0 +1,64 @@ + + + + workingPath + the working dir base path + + + token + access token + + + shell_cmd + wget -O /tmp/last_modified.csv.tar http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar ; hdfs dfs -copyFromLocal /tmp/last_modified.csv.tar /data/orcid_activities_2020/last_modified.csv.tar ; rm -f /tmp/last_modified.csv.tar + + the shell command that downloads the lambda file from orcid containing last orcid update informations + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd} + + + + + + + + + 
${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidDownloader + -d${workingPathOrcid}/ + -n${nameNode} + -flast_modified.csv + -odownload/ + -t${token} + + + + + + + \ No newline at end of file From 97c8111847a148fb738c593136d16934c6be15cf Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 23 Nov 2020 09:49:22 +0100 Subject: [PATCH 31/34] action to convert lambda file in seq file; spark action to download updated authors --- .../doiboost/orcid/OrcidDownloader.java | 185 +++++++++--------- .../orcid/SparkDownloadOrcidAuthors.java | 166 ++++++++++++++++ .../orcid/SparkGenLastModifiedSeq.java | 99 ++++++++++ .../orcid/SparkOrcidGenerateAuthors.java | 165 ---------------- .../orcid/SparkPartitionLambdaFile.java | 50 ----- .../orcid/model/DownloadedRecordData.java | 14 +- .../gen_orcid_authors_parameters.json | 4 - .../oozie_app/config-default.xml | 22 --- .../orcid_gen_authors/oozie_app/workflow.xml | 83 -------- .../oozie_app/workflow.xml | 122 +++++++++++- .../doiboost/orcid/OrcidClientTest.java | 139 +++++++++++-- .../0000-0001-6645-509X.compressed.base64 | 1 - .../0000-0003-3028-6161.compressed.base64 | 1 + 13 files changed, 608 insertions(+), 443 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64 diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java index 762d8aecd..be727ab9f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java @@ -1,14 +1,15 @@ package eu.dnetlib.doiboost.orcid; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; +import java.io.*; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.List; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; @@ -16,6 +17,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import 
org.apache.hadoop.io.compress.GzipCodec; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; @@ -27,10 +29,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; public class OrcidDownloader extends OrcidDSManager { static final int REQ_LIMIT = 24; -// static final int REQ_MAX_TEST = 100; - static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10000; + static final int REQ_MAX_TEST = -1; + static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500; static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; - static final String lastUpdate = "2019-09-30 00:00:00"; + static final String lastUpdate = "2020-09-29 00:00:00"; private String lambdaFileName; private String outputPath; private String token; @@ -41,7 +43,7 @@ public class OrcidDownloader extends OrcidDSManager { orcidDownloader.parseLambdaFile(); } - private String downloadRecord(String orcidId) { + private String downloadRecord(String orcidId) throws IOException { try (CloseableHttpClient client = HttpClients.createDefault()) { HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); httpGet.addHeader("Accept", "application/vnd.orcid+xml"); @@ -49,17 +51,23 @@ public class OrcidDownloader extends OrcidDSManager { CloseableHttpResponse response = client.execute(httpGet); if (response.getStatusLine().getStatusCode() != 200) { Log - .warn( + .info( "Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode()); return new String(""); } - return IOUtils.toString(response.getEntity().getContent()); - - } catch (Throwable e) { - Log.warn("Downloading " + orcidId, e.getMessage()); - +// return IOUtils.toString(response.getEntity().getContent()); + return xmlStreamToString(response.getEntity().getContent()); } - return new String(""); + } + + private String xmlStreamToString(InputStream xmlStream) throws IOException { + BufferedReader br = new BufferedReader(new InputStreamReader(xmlStream)); + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + return buffer.toString(); } public void parseLambdaFile() throws Exception { @@ -76,90 +84,87 @@ public class OrcidDownloader extends OrcidDSManager { hdfsServerUri .concat(workingPath) .concat(outputPath) - .concat("orcid_records.seq")); - - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - conf, - SequenceFile.Writer.file(hdfsoutputPath), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class))) { - - try (BufferedReader br = new BufferedReader(new InputStreamReader(lambdaFileStream))) { - String line; - int nReqTmp = 0; + .concat("updated_xml_authors.seq")); + try (TarArchiveInputStream tais = new TarArchiveInputStream( + new GzipCompressorInputStream(lambdaFileStream))) { + TarArchiveEntry entry = null; + StringBuilder sb = new StringBuilder(); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfsoutputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { startDownload = System.currentTimeMillis(); - long startReqTmp = System.currentTimeMillis(); - while ((line = br.readLine()) != null) { - parsedRecordsCounter++; - // skip headers line - if (parsedRecordsCounter == 1) { - continue; - } - String[] values = 
line.split(","); - List recordInfo = Arrays.asList(values); - String orcidId = recordInfo.get(0); - if (isModified(orcidId, recordInfo.get(3))) { - String record = downloadRecord(orcidId); - downloadedRecordsCounter++; - if (!record.isEmpty()) { - String compressRecord = ArgumentApplicationParser.compressArgument(record); - final Text key = new Text(recordInfo.get(0)); - final Text value = new Text(compressRecord); - - try { + while ((entry = tais.getNextTarEntry()) != null) { + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput + String line; + while ((line = br.readLine()) != null) { + String[] values = line.split(","); + List recordInfo = Arrays.asList(values); + int nReqTmp = 0; + long startReqTmp = System.currentTimeMillis(); + // skip headers line + if (parsedRecordsCounter == 0) { + parsedRecordsCounter++; + continue; + } + parsedRecordsCounter++; + String orcidId = recordInfo.get(0); + if (isModified(orcidId, recordInfo.get(3))) { + String record = downloadRecord(orcidId); + downloadedRecordsCounter++; + if (!record.isEmpty()) { +// String compressRecord = ArgumentApplicationParser.compressArgument(record); + final Text key = new Text(recordInfo.get(0)); + final Text value = new Text(record); writer.append(key, value); savedRecordsCounter++; - } catch (IOException e) { - Log.warn("Writing to sequence file: " + e.getMessage()); - Log.warn(e); - throw new RuntimeException(e); + } + } else { + break; + } + long endReq = System.currentTimeMillis(); + nReqTmp++; + if (nReqTmp == REQ_LIMIT) { + long reqSessionDuration = endReq - startReqTmp; + if (reqSessionDuration <= 1000) { + Log + .info( + "\nreqSessionDuration: " + + reqSessionDuration + + " nReqTmp: " + + nReqTmp + + " wait ...."); + Thread.sleep(1000 - reqSessionDuration); + } else { + nReqTmp = 0; + startReqTmp = System.currentTimeMillis(); + } + } + if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) { + Log + .info( + "Current parsed: " + + parsedRecordsCounter + + " downloaded: " + + downloadedRecordsCounter + + " saved: " + + savedRecordsCounter); + if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) { + break; } } } - long endReq = System.currentTimeMillis(); - nReqTmp++; - if (nReqTmp == REQ_LIMIT) { - long reqSessionDuration = endReq - startReqTmp; - if (reqSessionDuration <= 1000) { - Log - .warn( - "\nreqSessionDuration: " - + reqSessionDuration - + " nReqTmp: " - + nReqTmp - + " wait ...."); - Thread.sleep(1000 - reqSessionDuration); - } else { - nReqTmp = 0; - startReqTmp = System.currentTimeMillis(); - } - } - -// if (parsedRecordsCounter > REQ_MAX_TEST) { -// break; -// } - if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) { - Log - .info( - "Current parsed: " - + parsedRecordsCounter - + " downloaded: " - + downloadedRecordsCounter - + " saved: " - + savedRecordsCounter); -// if (parsedRecordsCounter > REQ_MAX_TEST) { -// break; -// } - } + long endDownload = System.currentTimeMillis(); + long downloadTime = endDownload - startDownload; + Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes"); } - long endDownload = System.currentTimeMillis(); - long downloadTime = endDownload - startDownload; - Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes"); } } - lambdaFileStream.close(); Log.info("Download started at: " + new Date(startDownload).toString()); + Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString()); Log.info("Parsed Records Counter: " + 
parsedRecordsCounter); Log.info("Downloaded Records Counter: " + downloadedRecordsCounter); Log.info("Saved Records Counter: " + savedRecordsCounter); @@ -185,7 +190,7 @@ public class OrcidDownloader extends OrcidDSManager { token = parser.get("token"); } - private boolean isModified(String orcidId, String modifiedDate) { + public boolean isModified(String orcidId, String modifiedDate) { Date modifiedDateDt = null; Date lastUpdateDt = null; try { @@ -195,7 +200,7 @@ public class OrcidDownloader extends OrcidDSManager { modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate); lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate); } catch (Exception e) { - Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage()); + Log.info("[" + orcidId + "] Parsing date: ", e.getMessage()); return true; } return modifiedDateDt.after(lastUpdateDt); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java new file mode 100644 index 000000000..850a654d4 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -0,0 +1,166 @@ + +package eu.dnetlib.doiboost.orcid; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.List; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.util.LongAccumulator; +import org.mortbay.log.Log; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData; +import scala.Tuple2; + +public class SparkDownloadOrcidAuthors { + + static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class); + static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; + static final String lastUpdate = "2020-09-29 00:00:00"; + + public static void main(String[] args) throws IOException, Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkDownloadOrcidAuthors.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/download_orcid_data.json"))); + parser.parseArgument(args); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); + final String workingPath = parser.get("workingPath"); + logger.info("workingPath: ", workingPath); + final String outputPath = parser.get("outputPath"); + logger.info("outputPath: ", outputPath); + final String token = parser.get("token"); + final String lambdaFileName = parser.get("lambdaFileName"); + logger.info("lambdaFileName: ", lambdaFileName); + + 
SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsedRecords"); + LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("modifiedRecords"); + LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloadedRecords"); + + logger.info("Retrieving data from lamda sequence file"); + JavaPairRDD lamdaFileRDD = sc + .sequenceFile(workingPath + lambdaFileName, Text.class, Text.class); + logger.info("Data retrieved: " + lamdaFileRDD.count()); + + Function, Boolean> isModifiedAfterFilter = data -> { + String orcidId = data._1().toString(); + String lastModifiedDate = data._2().toString(); + parsedRecordsAcc.add(1); + if (isModified(orcidId, lastModifiedDate)) { + modifiedRecordsAcc.add(1); + return true; + } + return false; + }; + + Function, Tuple2> downloadRecordFunction = data -> { + String orcidId = data._1().toString(); + String lastModifiedDate = data._2().toString(); + final DownloadedRecordData downloaded = new DownloadedRecordData(); + downloaded.setOrcidId(orcidId); + downloaded.setLastModifiedDate(lastModifiedDate); + try (CloseableHttpClient client = HttpClients.createDefault()) { + HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); + httpGet.addHeader("Accept", "application/vnd.orcid+xml"); + httpGet.addHeader("Authorization", String.format("Bearer %s", token)); + CloseableHttpResponse response = client.execute(httpGet); + int statusCode = response.getStatusLine().getStatusCode(); + downloaded.setStatusCode(statusCode); + if (statusCode != 200) { + logger + .info( + "Downloading " + orcidId + " status code: " + + response.getStatusLine().getStatusCode()); + return downloaded.toTuple2(); + } + downloadedRecordsAcc.add(1); + long currentDownloaded = downloadedRecordsAcc.value(); + if ((currentDownloaded % 10000) == 0) { + logger.info("Current downloaded: " + currentDownloaded); + } + downloaded + .setCompressedData( + ArgumentApplicationParser + .compressArgument(IOUtils.toString(response.getEntity().getContent()))); + } catch (Throwable e) { + logger.info("Downloading " + orcidId, e.getMessage()); + downloaded.setErrorMessage(e.getMessage()); + return downloaded.toTuple2(); + } + return downloaded.toTuple2(); + }; + + sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true"); + + logger.info("Start execution ..."); +// List> sampleList = lamdaFileRDD.take(500); +// JavaRDD> sampleRDD = sc.parallelize(sampleList); +// sampleRDD + JavaPairRDD authorsModifiedRDD = lamdaFileRDD + .filter(isModifiedAfterFilter); + logger.info("Authors modified count: " + authorsModifiedRDD.count()); + logger.info("Start downloading ..."); + authorsModifiedRDD + .map(downloadRecordFunction) + .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) + .saveAsNewAPIHadoopFile( + workingPath.concat(outputPath), + Text.class, + Text.class, + SequenceFileOutputFormat.class, + sc.hadoopConfiguration()); + logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString()); + logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString()); + logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString()); + }); + + } + + private static boolean isModified(String orcidId, String modifiedDate) { + Date modifiedDateDt = null; + Date lastUpdateDt = null; + try { + if 
(modifiedDate.length() != 19) { + modifiedDate = modifiedDate.substring(0, 19); + } + modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate); + lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate); + } catch (Exception e) { + logger.info("[" + orcidId + "] Parsing date: ", e.getMessage()); + return true; + } + return modifiedDateDt.after(lastUpdateDt); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java new file mode 100644 index 000000000..f710635ab --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java @@ -0,0 +1,99 @@ + +package eu.dnetlib.doiboost.orcid; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.mortbay.log.Log; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class SparkGenLastModifiedSeq { + private static String hdfsServerUri; + private static String workingPath; + private static String outputPath; + private static String lambdaFileName; + + public static void main(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkGenLastModifiedSeq.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/download_orcid_data.json"))); + parser.parseArgument(args); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + hdfsServerUri = parser.get("hdfsServerUri"); + workingPath = parser.get("workingPath"); + outputPath = parser.get("outputPath"); + lambdaFileName = parser.get("lambdaFileName"); + String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName); + + SparkConf sparkConf = new SparkConf(); + runWithSparkSession( + sparkConf, + isSparkSessionManaged, + spark -> { + int rowsNum = 0; + Path output = new Path( + hdfsServerUri + .concat(workingPath) + .concat(outputPath)); + Path hdfsreadpath = new Path(lambdaFileUri); + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath)); + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf); + FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath); + try (TarArchiveInputStream tais = new TarArchiveInputStream( + new GzipCompressorInputStream(lambdaFileStream))) { + TarArchiveEntry entry = null; + try 
(SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(output), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { + while ((entry = tais.getNextTarEntry()) != null) { + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); + String line; + while ((line = br.readLine()) != null) { + String[] values = line.split(","); + List recordInfo = Arrays.asList(values); + String orcidId = recordInfo.get(0); + final Text key = new Text(orcidId); + final Text value = new Text(recordInfo.get(3)); + writer.append(key, value); + rowsNum++; + } + } + } + } + Log.info("Saved rows from lamda csv tar file: " + rowsNum); + }); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java deleted file mode 100644 index 4e18ab840..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java +++ /dev/null @@ -1,165 +0,0 @@ - -package eu.dnetlib.doiboost.orcid; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.Text; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.util.LongAccumulator; -import org.mortbay.log.Log; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData; -import scala.Tuple2; - -public class SparkOrcidGenerateAuthors { - - static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; - static final String lastUpdate = "2019-09-30 00:00:00"; - - public static void main(String[] args) throws IOException, Exception { - Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class); - logger.info("[ SparkOrcidGenerateAuthors STARTED]"); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkOrcidGenerateAuthors.class - .getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json"))); - parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String workingPath = parser.get("workingPath"); - logger.info("workingPath: ", workingPath); - final String outputAuthorsPath = parser.get("outputAuthorsPath"); - logger.info("outputAuthorsPath: ", outputAuthorsPath); - final String token = parser.get("token"); - - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - JavaSparkContext sc = 
JavaSparkContext.fromSparkContext(spark.sparkContext()); - - LongAccumulator parsedRecordsAcc = sc.sc().longAccumulator("parsedRecords"); - LongAccumulator modifiedRecordsAcc = sc.sc().longAccumulator("modifiedRecords"); - LongAccumulator downloadedRecordsAcc = sc.sc().longAccumulator("downloadedRecords"); - LongAccumulator alreadyDownloadedRecords = sc.sc().longAccumulator("alreadyDownloadedRecords"); - JavaRDD lamdaFileRDD = sc.textFile(workingPath + "lamdafiles"); - - JavaRDD downloadedRDD = sc.textFile(workingPath + "downloaded"); - Function getOrcidIdFunction = line -> { - try { - String[] values = line.split(","); - return values[0].substring(1); - } catch (Exception e) { - return new String(""); - } - }; - List downloadedRecords = downloadedRDD.map(getOrcidIdFunction).collect(); - - Function isModifiedAfterFilter = line -> { - String[] values = line.split(","); - String orcidId = values[0]; - parsedRecordsAcc.add(1); - if (isModified(orcidId, values[3])) { - modifiedRecordsAcc.add(1); - return true; - } - return false; - }; - Function isNotDownloadedFilter = line -> { - String[] values = line.split(","); - String orcidId = values[0]; - if (downloadedRecords.contains(orcidId)) { - alreadyDownloadedRecords.add(1); - return false; - } - return true; - }; - Function> downloadRecordFunction = line -> { - String[] values = line.split(","); - String orcidId = values[0]; - String modifiedDate = values[3]; - return downloadRecord(orcidId, modifiedDate, token, downloadedRecordsAcc); - }; - - lamdaFileRDD - .filter(isModifiedAfterFilter) - .filter(isNotDownloadedFilter) - .map(downloadRecordFunction) - .rdd() - .saveAsTextFile(workingPath.concat(outputAuthorsPath)); - }); - - } - - private static boolean isModified(String orcidId, String modifiedDate) { - Date modifiedDateDt = null; - Date lastUpdateDt = null; - try { - if (modifiedDate.length() != 19) { - modifiedDate = modifiedDate.substring(0, 19); - } - modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate); - lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate); - } catch (Exception e) { - Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage()); - return true; - } - return modifiedDateDt.after(lastUpdateDt); - } - - private static Tuple2 downloadRecord(String orcidId, String modifiedDate, String token, - LongAccumulator downloadedRecordsAcc) { - final DownloadedRecordData data = new DownloadedRecordData(); - data.setOrcidId(orcidId); - data.setModifiedDate(modifiedDate); - try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); - httpGet.addHeader("Accept", "application/vnd.orcid+xml"); - httpGet.addHeader("Authorization", String.format("Bearer %s", token)); - CloseableHttpResponse response = client.execute(httpGet); - int statusCode = response.getStatusLine().getStatusCode(); - data.setStatusCode(statusCode); - if (statusCode != 200) { - Log - .warn( - "Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode()); - return data.toTuple2(); - } - downloadedRecordsAcc.add(1); - data - .setCompressedData( - ArgumentApplicationParser.compressArgument(IOUtils.toString(response.getEntity().getContent()))); - } catch (Throwable e) { - Log.warn("Downloading " + orcidId, e.getMessage()); - data.setErrorMessage(e.getMessage()); - return data.toTuple2(); - } - return data.toTuple2(); - } -} diff --git 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java deleted file mode 100644 index ca6f0f6c4..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java +++ /dev/null @@ -1,50 +0,0 @@ - -package eu.dnetlib.doiboost.orcid; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.IOException; -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -public class SparkPartitionLambdaFile { - - public static void main(String[] args) throws IOException, Exception { - Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkOrcidGenerateAuthors.class - .getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json"))); - parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - final String workingPath = parser.get("workingPath"); - - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD lamdaFileRDD = sc.textFile(workingPath + "last_modified.csv"); - - lamdaFileRDD - .repartition(20) - .saveAsTextFile(workingPath.concat("lamdafiles")); - }); - } - -} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java index f66ef82a2..da1a79b19 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java @@ -3,8 +3,6 @@ package eu.dnetlib.doiboost.orcid.model; import java.io.Serializable; -import org.apache.hadoop.io.Text; - import com.google.gson.JsonObject; import scala.Tuple2; @@ -12,7 +10,7 @@ import scala.Tuple2; public class DownloadedRecordData implements Serializable { private String orcidId; - private String modifiedDate; + private String lastModifiedDate; private String statusCode; private String compressedData; private String errorMessage; @@ -20,7 +18,7 @@ public class DownloadedRecordData implements Serializable { public Tuple2 toTuple2() { JsonObject data = new JsonObject(); data.addProperty("statusCode", getStatusCode()); - data.addProperty("modifiedDate", getModifiedDate()); + data.addProperty("lastModifiedDate", getLastModifiedDate()); if (getCompressedData() != null) { data.addProperty("compressedData", getCompressedData()); } @@ -66,11 +64,11 @@ public class DownloadedRecordData implements Serializable { this.compressedData = compressedData; } - public String getModifiedDate() { - return modifiedDate; + public String getLastModifiedDate() { + return lastModifiedDate; } - public void setModifiedDate(String modifiedDate) { - this.modifiedDate = modifiedDate; + public void 
setLastModifiedDate(String lastModifiedDate) { + this.lastModifiedDate = lastModifiedDate; } } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json deleted file mode 100644 index 35bfe1b41..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json +++ /dev/null @@ -1,4 +0,0 @@ -[{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true}, - {"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}, - {"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write the authors data", "paramRequired": true} -] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml deleted file mode 100644 index a720e7592..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - jobTracker - hadoop-rm3.garr-pa1.d4science.org:8032 - - - nameNode - hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 - - - queueName - default - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml deleted file mode 100644 index 7ebc5f0a0..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml +++ /dev/null @@ -1,83 +0,0 @@ - - - - workingPath - the working dir base path - - - token - access token - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - outputPath - the working dir base path - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - yarn - cluster - Split_Lambda_File - eu.dnetlib.doiboost.orcid.SparkPartitionLambdaFile - dhp-doiboost-1.2.1-SNAPSHOT.jar - --num-executors 24 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - - -w${workingPath}/ - -oauthors/ - -t${token} - - - - - - - - ${jobTracker} - ${nameNode} - yarn - cluster - Gen_Orcid_Authors - eu.dnetlib.doiboost.orcid.SparkOrcidGenerateAuthors - dhp-doiboost-1.2.1-SNAPSHOT.jar - --num-executors 20 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - - -w${workingPath}/ - -oauthors/ - -t${token} - - - - - - - \ No newline at end of file diff --git 
a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml index a3daab116..5f728d35b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml @@ -14,9 +14,63 @@ the shell command that downloads the lambda file from orcid containing last orcid update informations + + sparkExecutorNumber + 20 + + + sparkDriverMemory + 7G + memory for driver process + + + sparkExecutorMemory + 2G + memory for individual executor + + + sparkExecutorCores + 1 + number of cores used by single executor + + + spark2MaxExecutors + 20 + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + - - + + + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + @@ -26,6 +80,7 @@ + @@ -41,24 +96,77 @@ ${shell_cmd} - + - + ${jobTracker} ${nameNode} eu.dnetlib.doiboost.orcid.OrcidDownloader - -d${workingPathOrcid}/ + -w${workingPath}/ -n${nameNode} - -flast_modified.csv - -odownload/ + -flast_modified.csv.tar + -odownloads/ -t${token} + + + + yarn-cluster + cluster + GenLastModifiedSeq + eu.dnetlib.doiboost.orcid.SparkGenLastModifiedSeq + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + + -w${workingPath}/ + -n${nameNode} + -flast_modified.csv.tar + -olast_modified.seq + -t- + + + + + + + + yarn-cluster + cluster + DownloadOrcidAuthors + eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors + dhp-doiboost-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + + -w${workingPath}/ + -n${nameNode} + -flast_modified.seq + -odownloads/updated_authors + -t${token} + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 774475626..d6ce99f1c 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -5,17 +5,24 @@ import static 
org.junit.jupiter.api.Assertions.assertTrue; import java.io.*; import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.List; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -37,12 +44,49 @@ public class OrcidClientTest { // 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record' @Test - public void downloadTest() throws Exception { - String record = testDownloadRecord("0000-0001-6163-2042"); - File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml"); + private void multipleDownloadTest() throws Exception { + int toDownload = 1; + long start = System.currentTimeMillis(); + OrcidDownloader downloader = new OrcidDownloader(); + TarArchiveInputStream input = new TarArchiveInputStream( + new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar"))); + TarArchiveEntry entry = input.getNextTarEntry(); + BufferedReader br = null; + StringBuilder sb = new StringBuilder(); + int rowNum = 0; + int entryNum = 0; + int modified = 0; + while (entry != null) { + br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput + String line; + while ((line = br.readLine()) != null) { + String[] values = line.toString().split(","); + List recordInfo = Arrays.asList(values); + String orcidId = recordInfo.get(0); + if (downloader.isModified(orcidId, recordInfo.get(3))) { + downloadTest(orcidId); + modified++; + } + rowNum++; + if (modified > toDownload) { + break; + } + } + entryNum++; + entry = input.getNextTarEntry(); + } + long end = System.currentTimeMillis(); + logToFile("start test: " + new Date(start).toString()); + logToFile("end test: " + new Date(end).toString()); + } + + @Test + private void downloadTest(String orcid) throws Exception { + String record = testDownloadRecord(orcid); + String filename = "/tmp/downloaded_".concat(orcid).concat(".xml"); + File f = new File(filename); OutputStream outStream = new FileOutputStream(f); IOUtils.write(record.getBytes(), outStream); - System.out.println("saved to tmp"); } private String testDownloadRecord(String orcidId) throws Exception { @@ -50,7 +94,9 @@ public class OrcidClientTest { HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); httpGet.addHeader("Accept", "application/vnd.orcid+xml"); httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d"); + logToFile("start connection: " + new Date(System.currentTimeMillis()).toString()); CloseableHttpResponse response = client.execute(httpGet); + logToFile("end connection: " + new Date(System.currentTimeMillis()).toString()); if (response.getStatusLine().getStatusCode() != 200) { System.out .println("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode()); @@ -62,7 +108,7 @@ public class 
OrcidClientTest { return new String(""); } -// @Test + // @Test private void testLambdaFileParser() throws Exception { try (BufferedReader br = new BufferedReader( new InputStreamReader(this.getClass().getResourceAsStream("last_modified.csv")))) { @@ -108,7 +154,7 @@ public class OrcidClientTest { } } -// @Test + // @Test private void getRecordDatestamp() throws ParseException { Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate); Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate); @@ -126,7 +172,7 @@ public class OrcidClientTest { System.out.println(valueDt.toString()); } -// @Test + // @Test @Ignore private void testModifiedDate() throws ParseException { testDate(toRetrieveDate); @@ -134,14 +180,81 @@ public class OrcidClientTest { testDate(shortDate); } -// @Test - @Ignore - private void testReadBase64CompressedRecord() throws Exception { + @Test + public void testReadBase64CompressedRecord() throws Exception { final String base64CompressedRecord = IOUtils - .toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64")); + .toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64")); final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord); - System.out.println(recordFromSeqFile); - final String downloadedRecord = testDownloadRecord("0000-0001-6645-509X"); + logToFile("\n\ndownloaded \n\n" + recordFromSeqFile); + final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161"); assertTrue(recordFromSeqFile.equals(downloadedRecord)); } + + @Test + private void lambdaFileReaderTest() throws Exception { + TarArchiveInputStream input = new TarArchiveInputStream( + new GzipCompressorInputStream(new FileInputStream("/develop/last_modified.csv.tar"))); + TarArchiveEntry entry = input.getNextTarEntry(); + BufferedReader br = null; + StringBuilder sb = new StringBuilder(); + int rowNum = 0; + int entryNum = 0; + while (entry != null) { + br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput + String line; + while ((line = br.readLine()) != null) { + String[] values = line.toString().split(","); + List recordInfo = Arrays.asList(values); + assertTrue(recordInfo.size() == 4); + + rowNum++; + if (rowNum == 1) { + assertTrue(recordInfo.get(3).equals("last_modified")); + } else if (rowNum == 2) { + assertTrue(recordInfo.get(0).equals("0000-0002-0499-7333")); + } + } + entryNum++; + assertTrue(entryNum == 1); + entry = input.getNextTarEntry(); + } + } + + @Test + private void lambdaFileCounterTest() throws Exception { + final String lastUpdate = "2020-09-29 00:00:00"; + OrcidDownloader downloader = new OrcidDownloader(); + TarArchiveInputStream input = new TarArchiveInputStream( + new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar"))); + TarArchiveEntry entry = input.getNextTarEntry(); + BufferedReader br = null; + StringBuilder sb = new StringBuilder(); + int rowNum = 0; + int entryNum = 0; + int modified = 0; + while (entry != null) { + br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput + String line; + while ((line = br.readLine()) != null) { + String[] values = line.toString().split(","); + List recordInfo = Arrays.asList(values); + String orcidId = recordInfo.get(0); + if (downloader.isModified(orcidId, recordInfo.get(3))) { + modified++; + } + rowNum++; + } + entryNum++; + entry = input.getNextTarEntry(); + } + logToFile("rowNum: " + rowNum); + 
logToFile("modified: " + modified); + } + + private void logToFile(String log) + throws IOException { + log = log.concat("\n"); + Path path = Paths.get("/tmp/orcid_log.txt"); + Files.write(path, log.getBytes(), StandardOpenOption.APPEND); + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64 b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64 deleted file mode 100644 index 1b088e061..000000000 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64 +++ /dev/null @@ -1 +0,0 @@ -H4sIAAAAAAAAAO1a227bOBB9z1cIepd18SW24aho0wTbAgEWjRdY9I2RaJtbSdSSkhP165eURIm6kHa2SbCLNkBiWDxzhhxyZg7tbN49xZFxhIQinFyZ7sQxDZgEOETJ/sr8Y3trLU2DZiAJQYQTeGUWkJrv/IsNgQEm4bp6MVKQHa5M22E/Fvt1rcViNrfmzupP02AOErpGSQZJAqIr85Bl6dq2Hx8fJ5gEKGR/93ZCbYEQFjDMA5CV01KZNBBhEyKaoSTQW0mgxg6mbCUgg6HGrMEIK5wdILESEEO1VYsRVjGMH1i8DyhVW7WYJhqEYKKJBB8W2ADHsS4A1bhAV1uoRlfjAp2yaWG2S1YIM4AiqrbrIwXDN1g8ah3WgGblMbPWrJwPN9in6gxZKIRJhnYI6mI2BAueXZ5UGaCyrQFNVAjcQcISB+oC0oKEHQhDAqnGpga0WXRE7ABaKaZIf8j7SMHAIvtNbcVHBfLA0gSTQg2uAe0+pREuYhZK3WYJjLD6OwcRC/2pTO/AhC2F5IgCTfLVgO7ZPXVim71hFYLFEOm2tMW02UQhIAFP+pxojm0X186QvSfwiOCjbpoNSNg95JFmV/lof36MgOKc6KI3gJr+hcF+NlX9WJdgKXmqURmRE+RzdsroW+qRLrGxJYsBDe8uvs6qBAzMDphmfuO2AZePq4XY2pVspISVM1zyJCMiHIAI+jDZ2COPa4dayk2dUSL1JEdiJCCwTAErhtkBh/5d2SiskonAcGOrgEMqmj/EiPK+b4Wsq/me464sZ2l53tadrmeLtXc58ZbLry1n32IQ8QjQzIqZeGBBDAWrx7Ztbrnu1puu59P11JksPfdrE/sRm5FlRwDFMPQzkkNpjfXTIZ4Jmoqv7A49s96gxjolKAak0LN0QfU+j+7kpiowdR3SiCZRieSTVplyIWEcEUUPKEIZK85p/hChwKzJxgRYSyJvVXk+2k0abv187rWb1EGP8o1u/QlW3dZLi24lxHqPjjAp1RT1twgkRb4Z6IwO6ATfDsQoKkqs/xmBETIZ0e6GLW2H9LgVe5I2pLqNlmCmLTF120Ovq2gZe9AOa3lEK0Gl5ag0lWxZ6xAhWPSLEqJFJqhFnVB/WnuB6c59qNbG5J5+XSN44aTZ0+qlftg2eEkPWDSPecprY9Aqg2fUyZnlTLfObD2brZ3pZHm5OLNOStOUbjfaWMi47la3XM39Sh/VBqXkaWTfiWPXwFRMte7W0giMiqMvjbVkA7CKtb2yafkkmIpJ0ndaKhmn4uroZi1bF6niG2jCs2pRi1bx1kpdyyYwKg5+edESlABFP3zplOxPbk9wnnaHX9u9zC9VPjpEKZDjQAXYyooU+iFGzfwGg8+iO4Ioh77rTFzXWdnvr69v7u8nPCYTb7X0PNcZ9VNZPctRgknMjv53GBoZAQlF5Q2Wiz2zcQ8Cdu7oafct1/PmwDp1c1FiISyvSc9dOud4llMCoyrZWTHyKYx2o7Qd1PjJGTEbOYkjqJGjuOFJWqZy22XzzApwyG6qly67kCxWjnkqy+0WOSaWWe9LI1BYKAnhE1PNpj4lelqZp+XUmjpbz1szYTt3JjP38hyt3Od9raSXfVR19/TBqHBWEPHjr8192Wr8gl+RSJuzWi5nlrtyp+P3fJ2H3t1/yNS9++uoTn4eMGpsPztAvZCWd4Rrgillt/Q+XfcCoXGsAJXZkqEsOmOLK9g9K1CR9ZFdnBN+kzdu2WnNCTTuQEbQk3HNMp3VvlIXGnflZwfGDhPjI6y+FDC+wBQyJnbHMm7Ze0iMO3yElba7JTg2biIYZATzzzXSA4jwnoDYuEd7lvK0WZRmyhv71KLOb2oK9Hnn5YWam4ryVRqcytlbNznVPF690akcv1SzK/nPangq5An99W8jpIxKXSP4Gf2LlRI+CUAyFERQZJry+DZFuOyb1eeJ6pYjWxRM95fNrJlf+UQfpPPcVOsRS6nKxKebmxvjfXl+60V1x0fUyEBn9LS7rRfvP6rt64/GVlt3vnYXa8ebLJz5T6jt53ObB8OeLl2m2WZvJurP8fviav4cpz+BjF+4znzqzd3TMr5FvryMP5GBPyjjXyC/ZR+/ZPwvGd+Rzh8IQIl1jWOWVkyDf+L/PLMDATSuDyBJYGTdQ67DuYq/ZxUwg/vC+AAoq4fsyXuWtwVF1MA74+bIA/GFlwc2+BHSIgkOBCfoe1kvjC1OuYRPD4WBSi78DRq/szGu+H/p+ddqaiovb9bYVBN4veam8vj/l+6q0PwnNbu7OkOzy3bslxf3ZWNWPThpF4LC91or/va17gefq3e83v0GQZQdAkCgcZPsUQIhQcn+DW4NnbHyqwjxxaP2S0b/YmN3/tnSv/gH9+klwrUpAAA= \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64 b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64 new file mode 100644 index 000000000..8dc3d32ad --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64 @@ -0,0 +1 @@ 
+H4sIAAAAAAAAAO1dW5fbthF+z6/A2XPal5biRaREqmvlrG+JG6/t4900bd+4JLSCQxIqSa1X+fUFeIUkAiIlkpFs5jRxTQ1mgAEw881gQF7/+Ox74AmGEcLBiyt1pFwBGDjYRcHji6tf799K5hWIYjtwbQ8H8MXVBkZXP85/uA6hg0N3lv4BVna8fHElK+Qfifw7lsaKZkoTdaJeASIgiGYoiGEY2N6Lq2Ucr2ay/PXr1xEOHeSS/z7KQSTnFHkL6K4dO066xWtSkORtXBTFKHDErRiioh1ckZHYMXQFzQqavBWOlzCUAtuH/FYlTd7Kh/4D0fcSrfitSppCG2GIQ4Em6M85rYN9X6SA9PecOp1CPnX6e069It3CZJYkF8Y28iJ+u13KnMPvcPNVKDAjKEbuk9aCkdOfC9rndA1JyIVBjBYIinS2T5zzWayDdAfw2mYEhVZCuIAh2ThQpJCSKG9nu24II0GbjKDcRU+ILEBphSMkXuS7lDkHotnf+a3orznlkmwTHG74xBlBOU8rD298okrRZOU0eav/rW2PqP7QTt8iy9tGMHxCjmDzZQTba/fQii3mhlgIokMkmtKSptxNEbRDh276dShYttt0ZQ/J30P4hOBXUTcLorzdw9oTzCr9dbd/hEGE16FIe3ukV/MfAPnnOrUfs4SY2TzpryzFOkRzyj0i7EvWFV7iWmZa7LGh3mUuapUQ7DVb4iieF2IL4uRxOhBZOJJrZsOyO5yRxFJ42LE9OIfBtVzxOBMoZHmd7ah86zGC8l+cECZbQPJhvMTu/DZxFFLCKYTutcwj3GcVrR98FFG/L7nEq801RdUlxZK08b2mzDR9NlZHlmX9t+S522JP454dxZJPwANRoptz1RRJVSV1eq+NZwZhrIx0TflvofuKNhXD9mzkQ3ceh2vIjDF7uk9PAE3KL/EOO812fhS0XoXIt8ONmMs2UTbPlTN5nRqYzA4JQFNuiWpqWDUlZSqpk3vVnCnaTLNGxsSqqeGsSxSggCcUoQfkoZgY/dX6wUPOVdbJKmBXMmE7mKw7pmsTSdEl1Ugm35ypxshUpmXXtqgr+VUPWMxVNGBm0CU0mT2iJxgkKC2avwwJ2sV0F4uoDjBc2D7yNgnt/PWacIwr+LFE5YzIzJQwj0sgyeDOSLSIGLIrmeG07Xp2PJaQ4w7pFtdk+adgTcgjxWtsywzj5GBIPKgcELEMMsCYI0th+5xmu+/7SLAKSorHVUHP2SNtb+ImYwCrdSyR+I74fVUxjYkyuRLs+9ojlQtmJLpaefZGQoELn4nl2NGByFaINcC3FV3rluWfIqH93/dpJMdDRD9ES9XUbItqoJQyKOZAkwzL1CTTMsfVeInHfQs/VXHZxk88Ngfx1F5DuZFCdtSX2L87B6/WEZDAGy+iiDfc5bltJavY2cSkhAkUwiF6RPQP5/g5qQ1ea03GYTDb/mQ00QdXh4naM08JcgcnJN7fUfKBLZULZ+yNFG9WxaK4WRNkG4J3rwtOe5S1eD7Z3hrO9SmZBFXVp4pSyS+lqsWQ+MY5E1RFSXdHhJBE5V/t0JXtpOevUxgwIuQ/pk/evX7BdOOvtr/6x8oO4wDSX24/mPcfbz7fVfaOiqzVtxB6SVxAc0vzCHqLSnZbVNt+psr8VzkaFtHU9a9FlMTi5OxhGWozkbkUrX0KvoWIoYzRj49Y1Jrwku0mk2cUIgeWbhsYlbyKTKcYgxRUTZAHO1zdmmnaSB2bDZAHOzOLBcERaeD5GOL1qqGjPrErnEUfyRVkha5K3ZarqcBI+tTSLGMP1ahigJQzlPPmFQhLbHB3oREbmVsUwChjvS406kPrrAwRRNqnO+SO2RYtu2SW9YlumWXV2DUnjeVGWqnCShx3fBgoHXLErEAXUo9EM7gpx1dL6BP7FW4KLrsUQnYh9qAUo9iD80/L0pEzj8VLLSaiBEuSpd2Q0JVupXJKkycH25F/6dIwi2bpg4PtXHsz14xSLfbmkPoPDKawbIFoF1YN2TxqyKp2zJDVJkMWD6VMND/aAfojMamHO5Esul8DlBxqxhuAF+C3DfZRUG5F/rpkGWWphnqb3iGi5u/t0PYRLO0yfVireQgf6eB++0+5BdIn9YTjdUC24PzXEhzmjw4bnIPDLGYXRbb/gB7Xia+pNyn12rOwUdfVSbmCajVpKj1x9amt+/zuw08/fXz/ukoiQ3ZYi02Vw5w9iEivZQFO2UXm9YFYm5htC5uY5H8j3TD+dMymVWSiGmA2rWXMtq+XEzFbC1pnZQyYbcBsR2C2l7azhB4OI+pl7xxEDzMBCsC+hA4RnaL3ieiUc0B0ynGITjGOQnTm+SG6e/hsR8COwc2aJk86R3Y7YhoDu/t/f/fATtXGY2VAdiVZ68hui67MBXIyikUp1oHj2oLs2JxiwxTqbkfbzCke0RWW/0F8WiiLUW8FQlXHmmFYk8Z5xYKnXHI4FaYeUE+LqcUjlc/KGGDqdwRTPy03EXKIlw9ccEPAJA6w30KakejKxU6MQ9sDn7OCFsI/wg4xMrAPwKpax6E3/Rj0pqgdANaT8dvH3z17iX27c+D2AYe+fQJw+/jLmQG3+vBC5IzaABglMqhGGFmNvBBe5DS8c8/dMnYhsz1iHtfyeoWQH0PG48TUYgtZsXQ8Xls17kJu25Q8fnv127Vq+0pqHt+sql7ILafh8aAXDYQMEoJG9XMWrQlVjHuFwoyZYY0svUn9HNO5o7Kgp4ln+bMo80DeoKQ8bDv3imNcjBpUzojYpbUxqjJSVd2StZGiqKqlGF9g1KSURiQhoGbcQ39AF8QEgkQouWZCK7Kv6sstmTQTntfzUGxIlJYgw9pCKytzRNJOrdApeFc0q/ITtVbdNd2Pya6tCGeMiaqa1tgSBi+0rVxSVtWc1igdZZW2m28X78BT8+2n729WRsNApkYpKduqVjmpKlmmqUuqpdYsJ2UlbMU0VZx6jmka1NqyQcurEEdRCBdtRzTJbhEED2x/UrI77NkheLW0w0di4z5DFz+Dtx7+Cl7aMdmYmxm4AfckEoI++LTEMYYedEhI5NBEu0MimZvVKsS2syz6Jwpd5EP9q++KONT1pr4ll8Rj2a5b4knpzTXxOtCde+JJbMtFJfxruSke5QGA1Fg1XHVkW5quwy9k51PpJFRGjgfz3cRdooXNTNxcGli1ny8oLW8tMel48qGkpuCf6d+S85UlBG92TMwddhCMN9l4t5tW4Io9xCCIOQ+UKBwFyLfvfOiTkTYdnzcgh8htFZBrUkQWuWQaim7qk4nFrKMO4XhNqceC8X7hcXuT20m0pVjyp3/dvRqR5T4dmePJZKKbvYVcRPjqKXIOC7+MqW4jEiLrnRgefXw4EiopO4iExKbv1EjodMPKyjiDSKjxxTpWwomX61hWf2YkdMJFO5Zl08t2bNvv4Qju2MuIXVzA2+pYy+FsEqM+YS+2kQNCGK2IfYYUWAZ2TDrreRuAHWcdhih4BBEFmDhw1wTx0b+uvQVyIfBRAAlpCc6HkLYL388T1a//5/Xizwk1Ob05cgkch+t5LN
vE9jwZPeF7nviLyy9wb/qmL09LrviSVfBiX3eHLvnWSUgQ25m9SEha2SvqujvPSNSpP20hI0FNC3j35s0boOvxEmx5lbsVdBBZYVEMXhUK+DugNhHQdpebpzD1885TdBHKKupE/jKKsOfbMfUoxkixRophHGPVjvFmjeRfRkDb3oR3lZjSLd0YK5apTPtMTB2QehmT22K2wqidraguN20/W2G2d7e9BfPKyhiyFUO2YshWNNbrhWcrwC8wIlYWxRA44SaKh6REDyiOJ613JMfryJCa6BLd8WT0hPB44r/51ESpuzZSE/3XSpQvj+g0M5HWa70hqnncgFubegfiFZK7KlkpF/Sol7jUHMSk5iuad/lfcg6CTalPRtOpQZxHL5HpbjpdLPwyAtT2prq7shhlbFhT09T7mOOaUi9jclvMPmi1sw/V72RpP/vAmr72sw9NDSsrY8g+DNmHIfvQWK8Xnn0YaiXarJU4CdjxRPUL7ni9GBISXQI+noyeQB9P/DefkCh1d6G1Ej1lJMoiCPA5/SojcSHxEqb1E5zSiYipnbjcZMW0hXeEX2IEq1uqppCtofZSJlFT6mVEsO1NbtuZqMxGus8j9sba7qmGTkDFSNG0Pia+sfxTLtpXMvwOLtGrE0U3FKvG1ZGSsoN0iJFcc58mxnU607XRpP43mFi+1abboKZb05OX1hozXR0Z4/7SIUN4nTSWG2mlKv/R7dvAmgfQb+2HMMNMFPO8cu8hiZ2JEcIusMPQ3kQAP8EQePRyPXkCbYKLCPJ4XNK9RncssXrZhXoX5m8PAgscArhYIPpa2xhEyakOTA98CIrMvpk9BNz9+y+e/N59GK8jx/kxHrfLjfCS90G7iAR2cRLoZU/k7DRYXiEk3ymWNlU03VR1RZko2rhRONf/+bLRTzRX73wZtBSzlb0QH9botQ9r9L4utpqdXmzt7zNIw2HNbuPhsOYw67NBk9/LYc2ANc8Ja353yHI44ukyI8qT0VNWlCf+cgOA2jWnue4us+b0rGKCSznG2csFTvXJeR/j2OEzeuoi1083y5cHJ4XwOIgphtfMqazJ6niijhZrzxut3MXflrHvtZ/wl2AYEvixZ9nq2SnaNgkS56bCvGueed6Ajw+jyH6E81d2EOAYFH0E5TdhQDJWkMzF7CT9bUlsZDD3lPd9HHNwenEGZ2YJBpvqMl0EtyrBXvQGg97L6+5Y0f4B0cO52NHnYmbtczGz83MxazZWR4bZ3ldyWvCFrIzhXGw4F6Nkb+zQ24Abj+y97JsSeAFIJ+kHGglSXDNo8mfsQ6IeO0IR/X7jz+uAlpfGdDAognYEwS3ZjUN9aaeuhyeoT/fD68O3cKTF6c2Ri+nYaIDHtNeIgNeJE6OCLbYnRgb7vM4wOkg6eVqEwJuKb2HHnZ5DEkyc/RCR1enEDNY4x+RQmQTu6+XtLxH28GPyznbmQ8yXmhWyWvhY56XFsuOpKmdTNVrhANJPFZjj8eQYU38Momgkf4hqj4xqVUX8vdcyqk0ou41qVbLh9JFptvii8NP3MitjiGqHqJaSJfEsMVyv8cqm1x7T8w8YYhKtktD1Fsb2A/ZQ5NNA9pY8pef19BLlTbTxVzH2iRFwwOeJrIFb5JQffB0i2o4cEE9a706I15EBae8U6xLN/0V7K9T9eWJttQbWZtv56YeVy5o2n/9h5RYw+qf3+A58/PDmYsG4qZ35N027PaL1wq1Y1NBkS1anujac0NbKwTRV33BAKxzkpQW1qqXLdA3cKupUnfb3napMrC8QO4SwR4ewNa4ElJQdh7DqTJuOpnqLVwJO94CsjCGEHULYpJAPPpNlv8jeu5Acy5LpCOj+osHs78gN4AY8evgBxyGycbTxHBiSaNdNo11vE2Gp+mcS89IS9Q3wh9i2Oz/EE9KXL+LJ/xYiWU5vzvaUtruggNeHb/aQtpsAIenjcEbb4Rktd94u5Ii2Ttqo3SPa92iFXPAZRkSes+whH7T1G2WRTfHW8/L/lgKus0sbs/SP+Q//BxvQAv4zvAAA \ No newline at end of file From 5c17e768b26789df1e8bf120eeebab93854a716e Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 23 Nov 2020 16:01:23 +0100 Subject: [PATCH 32/34] set wf configuration with spark.dynamicAllocation.maxExecutors 20 over 20 input partitions --- .../orcid/SparkDownloadOrcidAuthors.java | 39 +++++++++++++------ .../oozie_app/workflow.xml | 4 +- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java index 850a654d4..68f44541a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -65,9 +65,14 @@ public class SparkDownloadOrcidAuthors { spark -> { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsedRecords"); - LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("modifiedRecords"); - LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloadedRecords"); + LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records"); + LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records"); + 
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records"); + LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403"); + LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409"); + LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503"); + LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525"); + LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic"); logger.info("Retrieving data from lamda sequence file"); JavaPairRDD lamdaFileRDD = sc @@ -99,6 +104,18 @@ public class SparkDownloadOrcidAuthors { int statusCode = response.getStatusLine().getStatusCode(); downloaded.setStatusCode(statusCode); if (statusCode != 200) { + switch (statusCode) { + case 403: + errorHTTP403Acc.add(1); + case 409: + errorHTTP409Acc.add(1); + case 503: + errorHTTP503Acc.add(1); + case 525: + errorHTTP525Acc.add(1); + default: + errorHTTPGenericAcc.add(1); + } logger .info( "Downloading " + orcidId + " status code: " @@ -106,10 +123,6 @@ public class SparkDownloadOrcidAuthors { return downloaded.toTuple2(); } downloadedRecordsAcc.add(1); - long currentDownloaded = downloadedRecordsAcc.value(); - if ((currentDownloaded % 10000) == 0) { - logger.info("Current downloaded: " + currentDownloaded); - } downloaded .setCompressedData( ArgumentApplicationParser @@ -125,14 +138,11 @@ public class SparkDownloadOrcidAuthors { sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true"); logger.info("Start execution ..."); -// List> sampleList = lamdaFileRDD.take(500); -// JavaRDD> sampleRDD = sc.parallelize(sampleList); -// sampleRDD - JavaPairRDD authorsModifiedRDD = lamdaFileRDD - .filter(isModifiedAfterFilter); + JavaPairRDD authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter); logger.info("Authors modified count: " + authorsModifiedRDD.count()); logger.info("Start downloading ..."); authorsModifiedRDD + .repartition(20) .map(downloadRecordFunction) .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) .saveAsNewAPIHadoopFile( @@ -144,6 +154,11 @@ public class SparkDownloadOrcidAuthors { logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString()); logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString()); logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString()); + logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString()); + logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString()); + logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString()); + logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString()); + logger.info("errorHTTPGenericAcc: " + errorHTTPGenericAcc.value().toString()); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml index 5f728d35b..1c2a7b588 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml @@ -149,9 +149,9 @@ eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors dhp-doiboost-${projectVersion}.jar - --num-executors=${sparkExecutorNumber} + --conf 
spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 99a086f0c68f17489e4b39e32e9fbbb24418b21d Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 24 Nov 2020 17:49:32 +0100 Subject: [PATCH 33/34] max concurrent executors set to 10, according to ORCID Director of Technology mail request --- .../orcid/SparkDownloadOrcidAuthors.java | 17 +++++-- .../oozie_app/workflow.xml | 6 +-- .../doiboost/orcid/OrcidClientTest.java | 47 +++++++++++++++++-- 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java index 68f44541a..598835a00 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -100,7 +100,13 @@ public class SparkDownloadOrcidAuthors { HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); httpGet.addHeader("Accept", "application/vnd.orcid+xml"); httpGet.addHeader("Authorization", String.format("Bearer %s", token)); + long startReq = System.currentTimeMillis(); CloseableHttpResponse response = client.execute(httpGet); + long endReq = System.currentTimeMillis(); + long reqTime = endReq - startReq; + if (reqTime < 1000) { + Thread.sleep(1000 - reqTime); + } int statusCode = response.getStatusLine().getStatusCode(); downloaded.setStatusCode(statusCode); if (statusCode != 200) { @@ -111,15 +117,16 @@ public class SparkDownloadOrcidAuthors { errorHTTP409Acc.add(1); case 503: errorHTTP503Acc.add(1); + throw new RuntimeException("Orcid request rate limit reached (HTTP 503)"); case 525: errorHTTP525Acc.add(1); default: errorHTTPGenericAcc.add(1); + logger + .info( + "Downloading " + orcidId + " status code: " + + response.getStatusLine().getStatusCode()); } - logger - .info( - "Downloading " + orcidId + " status code: " - + response.getStatusLine().getStatusCode()); return downloaded.toTuple2(); } downloadedRecordsAcc.add(1); @@ -142,7 +149,7 @@ public class SparkDownloadOrcidAuthors { logger.info("Authors modified count: " + authorsModifiedRDD.count()); logger.info("Start downloading ..."); authorsModifiedRDD - .repartition(20) + .repartition(10) .map(downloadRecordFunction) .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) .saveAsNewAPIHadoopFile( diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml index 1c2a7b588..b9383558c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml @@ -14,10 +14,6 @@ the shell command that downloads the lambda file from orcid containing last orcid update informations - - sparkExecutorNumber - 20 - sparkDriverMemory 7G @@ -35,7 +31,7 @@ spark2MaxExecutors - 20 + 10 
oozieActionShareLibForSpark2 diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index d6ce99f1c..66a7badb7 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -10,6 +10,9 @@ import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.temporal.TemporalUnit; import java.util.Arrays; import java.util.Date; import java.util.List; @@ -24,6 +27,7 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull; import org.junit.jupiter.api.Test; +import org.mortbay.log.Log; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import jdk.nashorn.internal.ir.annotations.Ignore; @@ -45,7 +49,7 @@ public class OrcidClientTest { @Test private void multipleDownloadTest() throws Exception { - int toDownload = 1; + int toDownload = 10; long start = System.currentTimeMillis(); OrcidDownloader downloader = new OrcidDownloader(); TarArchiveInputStream input = new TarArchiveInputStream( @@ -64,7 +68,7 @@ public class OrcidClientTest { List recordInfo = Arrays.asList(values); String orcidId = recordInfo.get(0); if (downloader.isModified(orcidId, recordInfo.get(3))) { - downloadTest(orcidId); + slowedDownDownload(orcidId); modified++; } rowNum++; @@ -181,7 +185,7 @@ public class OrcidClientTest { } @Test - public void testReadBase64CompressedRecord() throws Exception { + private void testReadBase64CompressedRecord() throws Exception { final String base64CompressedRecord = IOUtils .toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64")); final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord); @@ -257,4 +261,41 @@ public class OrcidClientTest { Path path = Paths.get("/tmp/orcid_log.txt"); Files.write(path, log.getBytes(), StandardOpenOption.APPEND); } + + @Test + private void slowedDownDownloadTest() throws Exception { + String orcid = "0000-0001-5496-1243"; + String record = slowedDownDownload(orcid); + String filename = "/tmp/downloaded_".concat(orcid).concat(".xml"); + File f = new File(filename); + OutputStream outStream = new FileOutputStream(f); + IOUtils.write(record.getBytes(), outStream); + } + + private String slowedDownDownload(String orcidId) throws Exception { + try (CloseableHttpClient client = HttpClients.createDefault()) { + HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); + httpGet.addHeader("Accept", "application/vnd.orcid+xml"); + httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d"); + long start = System.currentTimeMillis(); + CloseableHttpResponse response = client.execute(httpGet); + long endReq = System.currentTimeMillis(); + long reqSessionDuration = endReq - start; + logToFile("req time (millisec): " + reqSessionDuration); + if (reqSessionDuration < 1000) { + logToFile("wait ...."); + Thread.sleep(1000 - reqSessionDuration); + } + long end = System.currentTimeMillis(); + long total = end - start; + logToFile("total time (millisec): " + total); + if (response.getStatusLine().getStatusCode() != 200) { + 
logToFile("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode()); + } + return IOUtils.toString(response.getEntity().getContent()); + } catch (Throwable e) { + e.printStackTrace(); + } + return new String(""); + } } From 40c4559e923c5e2f0124477dc8deb8bfe2ef22d5 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 30 Nov 2020 14:19:22 +0100 Subject: [PATCH 34/34] added datainfo on authors pid with "sysimport:crosswalk:entityregistry", --- .../doiboost/orcidnodoi/oaf/PublicationToOaf.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index ece59c3f1..18fecc6c2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -144,7 +144,7 @@ public class PublicationToOaf implements Serializable { publication.setLastupdatetimestamp(new Date().getTime()); - publication.setDateofcollection("2019-10-22"); + publication.setDateofcollection("2020-10-14"); publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601()); // Adding external ids @@ -526,6 +526,18 @@ public class PublicationToOaf implements Serializable { q.setSchemeid(ModelConstants.DNET_PID_TYPES); q.setSchemename(ModelConstants.DNET_PID_TYPES); sp.setQualifier(q); + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(false); + dataInfo.setInferred(false); + dataInfo.setTrust("0.9"); + dataInfo + .setProvenanceaction( + mapQualifier( + "sysimport:crosswalk:entityregistry", + "Harvested", + "dnet:provenanceActions", + "dnet:provenanceActions")); + sp.setDataInfo(dataInfo); return sp; } }
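The final hunk above attaches a provenance qualifier to each author pid through a mapQualifier helper that is defined elsewhere in PublicationToOaf and therefore does not appear in this diff. Below is a minimal, illustrative sketch of what such a helper can look like, assuming the eu.dnetlib.dhp.schema.oaf.Qualifier bean with classid/classname/schemeid/schemename accessors used by the surrounding code; the class name QualifierSketch and the helper signature are assumptions for illustration, not the project's actual implementation.

import eu.dnetlib.dhp.schema.oaf.Qualifier;

public class QualifierSketch {

	// Hypothetical mapQualifier-style helper: wraps the four vocabulary fields
	// into a Qualifier, as done when tagging the author pid DataInfo above.
	public static Qualifier mapQualifier(String classId, String className,
		String schemeId, String schemeName) {
		final Qualifier q = new Qualifier();
		q.setClassid(classId);
		q.setClassname(className);
		q.setSchemeid(schemeId);
		q.setSchemename(schemeName);
		return q;
	}

	public static void main(String[] args) {
		// Same values the patch passes for the provenance action
		Qualifier provenance = mapQualifier(
			"sysimport:crosswalk:entityregistry",
			"Harvested",
			"dnet:provenanceActions",
			"dnet:provenanceActions");
		System.out.println(provenance.getClassid());
	}
}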