From fcbb4c148929c06756756f18a14cece1a5c232b3 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 24 Jun 2020 16:29:32 +0200 Subject: [PATCH] parser of orcid publication data from xml original dump --- dhp-workflows/dhp-doiboost/pom.xml | 6 + .../doiboost/orcid/model/AuthorData.java | 9 + .../orcidnodoi/model/Contributor.java | 54 +++ .../doiboost/orcidnodoi/model/ExternalId.java | 32 ++ .../orcidnodoi/model/PublicationDate.java | 32 ++ .../orcidnodoi/model/WorkDataNoDoi.java | 101 ++++++ .../orcidnodoi/xml/XMLRecordParserNoDoi.java | 216 ++++++++++++ .../orcid/xml/XMLRecordParserTest.java | 2 +- .../orcidnodoi/xml/OrcidNoDoiTest.java | 326 ++++++++++++++++++ .../xml/activity_work_0000-0003-2760-1191.xml | 106 ++++++ .../xml/activity_work_0000-0002-5982-8983.xml | 0 ...ty_work_0000-0003-2760-1191-similarity.xml | 113 ++++++ .../xml/activity_work_0000-0003-2760-1191.xml | 106 ++++++ 13 files changed, 1102 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml rename dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/{orcid => orcidnodoi}/xml/activity_work_0000-0002-5982-8983.xml (100%) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 39bb81ec1..2662d0a39 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -84,6 +84,12 @@ spark-sql_2.11 + + org.apache.commons + commons-text + 1.8 + + diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java index 29551c347..87f1f65c8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java @@ -9,6 +9,7 @@ public class AuthorData implements Serializable { private String name; private String surname; private String creditName; + private String otherName; private String errorCode; public String getErrorCode() { @@ -50,4 +51,12 @@ public class AuthorData implements Serializable { public void setOid(String oid) { this.oid = oid; } + + public String getOtherName() { + return otherName; + } + + public void setOtherName(String otherName) { + this.otherName = otherName; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java new file mode 100644 index 000000000..42076de5d --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java @@ -0,0 +1,54 @@ + +package eu.dnetlib.doiboost.orcidnodoi.model; + +import java.io.Serializable; + +import eu.dnetlib.doiboost.orcid.model.AuthorData; + +public class Contributor extends AuthorData implements Serializable { + private String sequence; + private String role; + private boolean simpleMatch = false; + private Double score = 0.0; + private boolean bestMatch = false; + + public String getSequence() { + return sequence; + } + + public void setSequence(String sequence) { + this.sequence = sequence; + } + + public String getRole() { + return role; + } + + public void setRole(String role) { + this.role = role; + } + + public boolean isSimpleMatch() { + return simpleMatch; + } + + public void setSimpleMatch(boolean simpleMatch) { + this.simpleMatch = simpleMatch; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + public boolean isBestMatch() { + return bestMatch; + } + + public void setBestMatch(boolean bestMatch) { + this.bestMatch = bestMatch; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java new file mode 100644 index 000000000..865e54ae3 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java @@ -0,0 +1,32 @@ + +package eu.dnetlib.doiboost.orcidnodoi.model; + +public class ExternalId { + private String type; + private String value; + private String relationShip; + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + public String getRelationShip() { + return relationShip; + } + + public void setRelationShip(String relationShip) { + this.relationShip = relationShip; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java new file mode 100644 index 000000000..9282a80ba --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java @@ -0,0 +1,32 @@ + +package eu.dnetlib.doiboost.orcidnodoi.model; + +public class PublicationDate { + private String year; + private String month; + private String day; + + public String getYear() { + return year; + } + + public void setYear(String year) { + this.year = year; + } + + public String getMonth() { + return month; + } + + public void setMonth(String month) { + this.month = month; + } + + public String getDay() { + return day; + } + + public void setDay(String day) { + this.day = day; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java new file mode 100644 index 000000000..ee13454e1 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java @@ -0,0 +1,101 @@ + +package eu.dnetlib.doiboost.orcidnodoi.model; + +import java.io.Serializable; +import java.util.List; + +public class WorkDataNoDoi implements Serializable { + + private String oid; + private String id; + private String sourceName; + private String type; + private List titles; + private List urls; + List extIds; + List publicationDates; + List contributors; + + public String getOid() { + return oid; + } + + public void setOid(String oid) { + this.oid = oid; + } + + public String getErrorCode() { + return errorCode; + } + + public void setErrorCode(String errorCode) { + this.errorCode = errorCode; + } + + private String errorCode; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getTitles() { + return titles; + } + + public void setTitles(List titles) { + this.titles = titles; + } + + public String getSourceName() { + return sourceName; + } + + public void setSourceName(String sourceName) { + this.sourceName = sourceName; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public List getUrls() { + return urls; + } + + public void setUrls(List urls) { + this.urls = urls; + } + + public List getExtIds() { + return extIds; + } + + public void setExtIds(List extIds) { + this.extIds = extIds; + } + + public List getPublicationDates() { + return publicationDates; + } + + public void setPublicationDates(List publicationDates) { + this.publicationDates = publicationDates; + } + + public List getContributors() { + return contributors; + } + + public void setContributors(List contributors) { + this.contributors = contributors; + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java new file mode 100644 index 000000000..6e5771547 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -0,0 +1,216 @@ + +package eu.dnetlib.doiboost.orcidnodoi.xml; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.ximpleware.*; + +import eu.dnetlib.dhp.parser.utility.VtdException; +import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; +import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; +import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId; +import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; + +public class XMLRecordParserNoDoi { + + private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class); + + private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common"; + private static final String NS_COMMON = "common"; + private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person"; + private static final String NS_PERSON = "person"; + private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details"; + private static final String NS_DETAILS = "personal-details"; + private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name"; + private static final String NS_OTHER = "other-name"; + private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record"; + private static final String NS_RECORD = "record"; + private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error"; + + private static final String NS_WORK = "work"; + private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; + + private static final String NS_ERROR = "error"; + + public static WorkDataNoDoi VTDParseWorkData(byte[] bytes) + throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException, + NavException, XPathEvalException { + logger.info("parsing xml ..."); + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); + ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL); + ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); + + WorkDataNoDoi workData = new WorkDataNoDoi(); + final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); + if (!errors.isEmpty()) { + workData.setErrorCode(errors.get(0)); + return workData; + } + + List workNodes = VtdUtilityParser + .getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "put-code")); + if (!workNodes.isEmpty()) { + final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1]; + workData.setOid(oid); + final String id = (workNodes.get(0).getAttributes().get("put-code")); + workData.setId(id); + } else { + return null; + } + + final List titles = VtdUtilityParser + .getTextValue( + ap, vn, "//common:title"); + if (!titles.isEmpty()) { + workData.setTitles(titles); + } + + final List sourceNames = VtdUtilityParser + .getTextValue( + ap, vn, "//common:source-name"); + if (!sourceNames.isEmpty()) { + workData.setSourceName(sourceNames.get(0)); + } + + final List types = VtdUtilityParser + .getTextValue( + ap, vn, "//work:type"); + if (!types.isEmpty()) { + workData.setType(types.get(0)); + } + + final List urls = VtdUtilityParser + .getTextValue( + ap, vn, "//common:url"); + if (!urls.isEmpty()) { + workData.setUrls(urls); + } + + workData.setPublicationDates(getPublicationDates(vg, vn, ap)); + workData.setExtIds(getExternalIds(vg, vn, ap)); + workData.setContributors(getContributors(vg, vn, ap)); + return workData; + + } + + private static List getPublicationDates(VTDGen vg, VTDNav vn, AutoPilot ap) + throws XPathParseException, NavException, XPathEvalException { + List publicationDates = new ArrayList(); + int yearIndex = 0; + ap.selectXPath("//common:publication-date/common:year"); + while (ap.evalXPath() != -1) { + PublicationDate publicationDate = new PublicationDate(); + int t = vn.getText(); + if (t >= 0) { + publicationDate.setYear(vn.toNormalizedString(t)); + publicationDates.add(yearIndex, publicationDate); + yearIndex++; + } + } + int monthIndex = 0; + ap.selectXPath("//common:publication-date/common:month"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + publicationDates.get(monthIndex).setMonth(vn.toNormalizedString(t)); + monthIndex++; + } + } + int dayIndex = 0; + ap.selectXPath("//common:publication-date/common:day"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + publicationDates.get(dayIndex).setDay(vn.toNormalizedString(t)); + dayIndex++; + } + } + return publicationDates; + } + + private static List getExternalIds(VTDGen vg, VTDNav vn, AutoPilot ap) + throws XPathParseException, NavException, XPathEvalException { + List extIds = new ArrayList(); + int typeIndex = 0; + ap.selectXPath("//common:external-id/common:external-id-type"); + while (ap.evalXPath() != -1) { + ExternalId extId = new ExternalId(); + int t = vn.getText(); + if (t >= 0) { + extId.setType(vn.toNormalizedString(t)); + extIds.add(typeIndex, extId); + typeIndex++; + } + } + int valueIndex = 0; + ap.selectXPath("//common:external-id/common:external-id-value"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + extIds.get(valueIndex).setValue(vn.toNormalizedString(t)); + valueIndex++; + } + } + int relationshipIndex = 0; + ap.selectXPath("//common:external-id/common:external-id-relationship"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + extIds.get(relationshipIndex).setRelationShip(vn.toNormalizedString(t)); + relationshipIndex++; + } + } + if (typeIndex == valueIndex) { + return extIds; + } + return new ArrayList(); + } + + private static List getContributors(VTDGen vg, VTDNav vn, AutoPilot ap) + throws XPathParseException, NavException, XPathEvalException { + List contributors = new ArrayList(); + int nameIndex = 0; + ap.selectXPath("//work:contributor/work:credit-name"); + while (ap.evalXPath() != -1) { + Contributor contributor = new Contributor(); + int t = vn.getText(); + if (t >= 0) { + contributor.setCreditName(vn.toNormalizedString(t)); + contributors.add(nameIndex, contributor); + nameIndex++; + } + } + + int sequenceIndex = 0; + ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t)); + sequenceIndex++; + } + } + + int roleIndex = 0; + ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role"); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t >= 0) { + contributors.get(roleIndex).setRole(vn.toNormalizedString(t)); + roleIndex++; + } + } + return contributors; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java index d5da4eec0..4d8237f77 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -44,7 +44,7 @@ public class XMLRecordParserTest { String xml = IOUtils .toString( - this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml")); + this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml")); XMLRecordParser p = new XMLRecordParser(); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java new file mode 100644 index 000000000..31f8432ac --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -0,0 +1,326 @@ + +package eu.dnetlib.doiboost.orcidnodoi.xml; + +import com.ximpleware.NavException; +import com.ximpleware.ParseException; +import com.ximpleware.XPathEvalException; +import com.ximpleware.XPathParseException; +import eu.dnetlib.dhp.parser.utility.VtdException; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; +import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import jdk.nashorn.internal.ir.annotations.Ignore; +import org.apache.commons.io.IOUtils; +import org.apache.commons.text.similarity.JaccardSimilarity; +import org.apache.commons.text.similarity.JaroWinklerSimilarity; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.text.Normalizer; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class OrcidNoDoiTest { + + private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class); + + String nameA = "Khairy"; + String surnameA = "Abdel Dayem"; + String otherNameA = "Dayem MKA"; + String nameB = "K"; + String surnameB = "Abdel-Dayem"; + String orcidIdA = "0000-0003-2760-1191"; + Double threshold = 0.8; + + @Test + @Ignore + private void similarityTest() throws Exception { + logger.info("running testSimilarity ...."); + logger + .info( + "JaroWinklerSimilarity: " + + Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB))); + logger + .info( + "JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB))); + } + + @Test + @Ignore + private void bestMatchTest() throws Exception { + logger.info("running bestMatchTest ...."); + String contributor = surnameB + ", " + nameB; + logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor))); + } + + private static Double bestMatch(String authorSurname, String authorName, String contributor) { + logger.debug(authorSurname + " " + authorName + " vs " + contributor); + String[] contributorSplitted = contributor.split(" "); + if (contributorSplitted.length == 0) { + return 0.0; + } + final String contributorName = contributorSplitted[contributorSplitted.length - 1]; + String contributorSurname = ""; + if (contributorSplitted.length > 1) { + StringJoiner joiner = new StringJoiner(" "); + for (int i = 0; i < contributorSplitted.length - 1; i++) { + joiner.add(contributorSplitted[i]); + } + contributorSurname = joiner.toString(); + } + logger + .debug( + "contributorName: " + contributorName + + " contributorSurname: " + contributorSurname); + String authorNameNrm = normalize(authorName); + String authorSurnameNrm = normalize(authorSurname); + String contributorNameNrm = normalize(contributorName); + String contributorSurnameNrm = normalize(contributorSurname); + Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm); + Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm); + if (sm1.compareTo(sm2) >= 0) { + return sm1; + } + return sm2; + } + + private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { + Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB); + logger + .debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score)); + return score; + } + + private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) { + return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); + } + + private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) { + return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); + } + + private static String parse(String name, String surname) { + return surname + " " + name; + } + + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + + @Test + @Ignore + public void readPublicationFieldsTest() + throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + logger.info("running loadPublicationFieldsTest ...."); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml")); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData); + assertNotNull(workData.getOid()); + logger.info("oid: " + workData.getOid()); + assertNotNull(workData.getTitles()); + logger.info("titles: "); + workData.getTitles().forEach(t -> { + logger.info(t); + }); + logger.info("source: " + workData.getSourceName()); + logger.info("type: " + workData.getType()); + logger.info("urls: "); + workData.getUrls().forEach(u -> { + logger.info(u); + }); + logger.info("publication date: "); + workData.getPublicationDates().forEach(d -> { + logger.info(d.getYear() + " - " + d.getMonth() + " - " + d.getDay()); + }); + logger.info("external id: "); + workData.getExtIds().removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self")); + workData.getExtIds().forEach(e -> { + logger.info(e.getType() + " - " + e.getValue() + " - " + e.getRelationShip()); + }); + logger.info("contributors: "); + workData.getContributors().forEach(c -> { + logger + .info( + c.getName() + " - " + c.getRole() + " - " + c.getSequence()); + }); + + } + + private void updateRanks(List contributors) { + boolean seqFound = false; + if (contributors + .stream() + .filter( + c -> c.getRole() != null && c.getSequence() != null && + c.getRole().equals("author") && (c.getSequence().equals("first") || + c.getSequence().equals("additional"))) + .count() > 0) { + seqFound = true; + logger.info("sequence data found"); + } + if (!seqFound) { + List seqIds = Arrays.asList(0); + contributors.forEach(c -> { + int currentSeq = seqIds.get(0) + 1; + seqIds.set(0, currentSeq); + c.setSequence(Integer.toString(seqIds.get(0))); + }); + } + } + + private void updateAuthorsSimpleMatch(List contributors, AuthorData author) { + contributors.forEach(c -> { + if (c.isSimpleMatch()) { + logger.info("simple match on : " + c.getCreditName()); + c.setName(author.getName()); + c.setSurname(author.getSurname()); + c.setOid(author.getOid()); + } + }); + updateRanks(contributors); + } + + private void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { + logger.info("inside updateAuthorsSimilarityMatch ..."); + contributors.forEach(c -> { + logger + .info( + c.getOid() + " - " + c.getCreditName() + " - " + + c.getName() + " - " + c.getSurname() + " - " + + c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: " + + c.isSimpleMatch()); + }); + + contributors + .stream() + .filter(c -> c.isBestMatch()) + .forEach(c -> { + logger.info("similarity match on : " + c.getCreditName()); + c.setName(author.getName()); + c.setSurname(author.getSurname()); + c.setOid(author.getOid()); + }); + updateRanks(contributors); + } + + @Test + @Ignore + public void authorSimilarityMatchTest() throws Exception { + logger.info("running authorSimilarityMatchTest ...."); + authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml"); + } + + @Test + private void authorSimpleMatchTest() throws Exception { + logger.info("running authorSimpleMatchTest ...."); + authorMatchTest("activity_work_0000-0003-2760-1191.xml"); + } + + private void authorMatchTest(String orcidWork) + throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { + AuthorData author = new AuthorData(); + author.setName(nameA); + author.setSurname(surnameA); + author.setOid(orcidIdA); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream(orcidWork)); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData); + int matchCounter = 0; + List matchCounters = Arrays.asList(matchCounter); + Contributor contributor = null; + workData.getContributors().forEach(c -> { + if (normalize(c.getCreditName()).contains(normalize(author.getName())) || + normalize(c.getCreditName()).contains(normalize(author.getSurname())) || + ((author.getOtherName() != null) + && normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) { + matchCounters.set(0, matchCounters.get(0) + 1); + c.setSimpleMatch(true); + } + }); + logger.info("match counter: " + Integer.toString(matchCounters.get(0))); + if (matchCounters.get(0) == 1) { + updateAuthorsSimpleMatch(workData.getContributors(), author); + } else if (matchCounters.get(0) > 1) { + Optional optCon = workData + .getContributors() + .stream() + .filter(c -> c.isSimpleMatch()) + .map(c -> { + c.setScore(bestMatch(nameA, surnameA, c.getCreditName())); + logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore()); + return c; + }) + .filter(c -> c.getScore() >= threshold) + .max(Comparator.comparing(c -> c.getScore())); + Contributor bestMatchContributor = null; + if (optCon.isPresent()) { + bestMatchContributor = optCon.get(); + bestMatchContributor.setBestMatch(true); + logger.info("best match: " + bestMatchContributor.getCreditName()); + updateAuthorsSimilarityMatch(workData.getContributors(), author); + } + + } + + logger.info("UPDATED contributors: "); + workData.getContributors().forEach(c -> { + logger + .info( + c.getOid() + " - " + c.getCreditName() + " - " + + c.getName() + " - " + c.getSurname() + " - " + + c.getRole() + " - " + c.getSequence()); + }); + } +} + +// +// orcid_RDD = sc.textFile(ORCID_DUMP_PATH) +// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec") +// \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml new file mode 100644 index 000000000..485f4f8e8 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml @@ -0,0 +1,106 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Abdel-Dayem K + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + + diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml similarity index 100% rename from dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml rename to dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml new file mode 100644 index 000000000..650d5a4cb --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml @@ -0,0 +1,113 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Abdel-Dayem K + + first + author + + + + Abdel-Dayem Fake + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + + diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml new file mode 100644 index 000000000..485f4f8e8 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml @@ -0,0 +1,106 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Abdel-Dayem K + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + +