diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml
index 39bb81ec13..2662d0a399 100644
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@@ -84,6 +84,12 @@
spark-sql_2.11
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+            <version>1.8</version>
+        </dependency>
+
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
index 29551c347a..87f1f65c8d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
@@ -9,6 +9,7 @@ public class AuthorData implements Serializable {
private String name;
private String surname;
private String creditName;
+ private String otherName;
private String errorCode;
public String getErrorCode() {
@@ -50,4 +51,12 @@ public class AuthorData implements Serializable {
public void setOid(String oid) {
this.oid = oid;
}
+
+ public String getOtherName() {
+ return otherName;
+ }
+
+ public void setOtherName(String otherName) {
+ this.otherName = otherName;
+ }
}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
new file mode 100644
index 0000000000..42076de5de
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
@@ -0,0 +1,69 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+import java.io.Serializable;
+
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+
+/**
+ * A contributor of an ORCID work: the inherited author fields plus the
+ * contributor's sequence and role, and the bookkeeping used when the
+ * contributor is matched against an author.
+ */
+public class Contributor extends AuthorData implements Serializable {
+
+    /** Contributor sequence, kept as the raw string. */
+    private String sequence;
+
+    /** Contributor role, kept as the raw string. */
+    private String role;
+
+    /** Simple-match flag; {@code false} until a caller sets it. */
+    private boolean simpleMatch;
+
+    /** Best-match flag; {@code false} until a caller sets it. */
+    private boolean bestMatch;
+
+    /** Match score; starts at {@code 0.0}. */
+    private Double score = 0.0;
+
+    public String getSequence() {
+        return sequence;
+    }
+
+    public void setSequence(String sequence) {
+        this.sequence = sequence;
+    }
+
+    public String getRole() {
+        return role;
+    }
+
+    public void setRole(String role) {
+        this.role = role;
+    }
+
+    public boolean isSimpleMatch() {
+        return simpleMatch;
+    }
+
+    public void setSimpleMatch(boolean simpleMatch) {
+        this.simpleMatch = simpleMatch;
+    }
+
+    public boolean isBestMatch() {
+        return bestMatch;
+    }
+
+    public void setBestMatch(boolean bestMatch) {
+        this.bestMatch = bestMatch;
+    }
+
+    public Double getScore() {
+        return score;
+    }
+
+    public void setScore(Double score) {
+        this.score = score;
+    }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
new file mode 100644
index 0000000000..865e54ae37
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+import java.io.Serializable;
+
+/**
+ * An external identifier of an ORCID work: its type, its value and its
+ * relationship to the work.
+ * <p>
+ * Serializable because {@link WorkDataNoDoi}, which is serializable, holds a list
+ * of these; otherwise Java serialization of a work with external ids fails with
+ * {@code NotSerializableException}.
+ */
+public class ExternalId implements Serializable {
+
+    private String type;
+    private String value;
+    private String relationShip;
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    public String getValue() {
+        return value;
+    }
+
+    public void setValue(String value) {
+        this.value = value;
+    }
+
+    public String getRelationShip() {
+        return relationShip;
+    }
+
+    public void setRelationShip(String relationShip) {
+        this.relationShip = relationShip;
+    }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
new file mode 100644
index 0000000000..9282a80ba2
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
@@ -0,0 +1,42 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+import java.io.Serializable;
+
+/**
+ * A publication date of an ORCID work, with year, month and day kept as
+ * separate strings; each stays {@code null} until its setter is called.
+ * <p>
+ * Serializable because {@link WorkDataNoDoi}, which is serializable, holds a list
+ * of these.
+ */
+public class PublicationDate implements Serializable {
+
+    private String year;
+    private String month;
+    private String day;
+
+    public String getYear() {
+        return year;
+    }
+
+    public void setYear(String year) {
+        this.year = year;
+    }
+
+    public String getMonth() {
+        return month;
+    }
+
+    public void setMonth(String month) {
+        this.month = month;
+    }
+
+    public String getDay() {
+        return day;
+    }
+
+    public void setDay(String day) {
+        this.day = day;
+    }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
new file mode 100644
index 0000000000..ee13454e1b
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
@@ -0,0 +1,105 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Data extracted from a single ORCID work.
+ * <p>
+ * The list properties are typed so that callers get compile-time checking; they
+ * stay {@code null} until a setter is called.
+ */
+public class WorkDataNoDoi implements Serializable {
+
+    private String oid;
+    private String id;
+    private String sourceName;
+    private String type;
+    private String errorCode;
+    private List<String> titles;
+    private List<String> urls;
+    private List<ExternalId> extIds;
+    private List<PublicationDate> publicationDates;
+    private List<Contributor> contributors;
+
+    public String getOid() {
+        return oid;
+    }
+
+    public void setOid(String oid) {
+        this.oid = oid;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    public String getSourceName() {
+        return sourceName;
+    }
+
+    public void setSourceName(String sourceName) {
+        this.sourceName = sourceName;
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    public String getErrorCode() {
+        return errorCode;
+    }
+
+    public void setErrorCode(String errorCode) {
+        this.errorCode = errorCode;
+    }
+
+    public List<String> getTitles() {
+        return titles;
+    }
+
+    public void setTitles(List<String> titles) {
+        this.titles = titles;
+    }
+
+    public List<String> getUrls() {
+        return urls;
+    }
+
+    public void setUrls(List<String> urls) {
+        this.urls = urls;
+    }
+
+    public List<ExternalId> getExtIds() {
+        return extIds;
+    }
+
+    public void setExtIds(List<ExternalId> extIds) {
+        this.extIds = extIds;
+    }
+
+    public List<PublicationDate> getPublicationDates() {
+        return publicationDates;
+    }
+
+    public void setPublicationDates(List<PublicationDate> publicationDates) {
+        this.publicationDates = publicationDates;
+    }
+
+    public List<Contributor> getContributors() {
+        return contributors;
+    }
+
+    public void setContributors(List<Contributor> contributors) {
+        this.contributors = contributors;
+    }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
new file mode 100644
index 0000000000..6e5771547a
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
@@ -0,0 +1,216 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.xml;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ximpleware.*;
+
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
+import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+public class XMLRecordParserNoDoi {
+
+ private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);
+
+ private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
+ private static final String NS_COMMON = "common";
+ private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
+ private static final String NS_PERSON = "person";
+ private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
+ private static final String NS_DETAILS = "personal-details";
+ private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
+ private static final String NS_OTHER = "other-name";
+ private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
+ private static final String NS_RECORD = "record";
+ private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
+
+ private static final String NS_WORK = "work";
+ private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
+
+ private static final String NS_ERROR = "error";
+
+ public static WorkDataNoDoi VTDParseWorkData(byte[] bytes)
+ throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
+ NavException, XPathEvalException {
+ logger.info("parsing xml ...");
+ final VTDGen vg = new VTDGen();
+ vg.setDoc(bytes);
+ vg.parse(true);
+ final VTDNav vn = vg.getNav();
+ final AutoPilot ap = new AutoPilot(vn);
+ ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
+ ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
+ ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
+
+ WorkDataNoDoi workData = new WorkDataNoDoi();
+ final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
+ if (!errors.isEmpty()) {
+ workData.setErrorCode(errors.get(0));
+ return workData;
+ }
+
+ List workNodes = VtdUtilityParser
+ .getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "put-code"));
+ if (!workNodes.isEmpty()) {
+ final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
+ workData.setOid(oid);
+ final String id = (workNodes.get(0).getAttributes().get("put-code"));
+ workData.setId(id);
+ } else {
+ return null;
+ }
+
+ final List titles = VtdUtilityParser
+ .getTextValue(
+ ap, vn, "//common:title");
+ if (!titles.isEmpty()) {
+ workData.setTitles(titles);
+ }
+
+ final List sourceNames = VtdUtilityParser
+ .getTextValue(
+ ap, vn, "//common:source-name");
+ if (!sourceNames.isEmpty()) {
+ workData.setSourceName(sourceNames.get(0));
+ }
+
+ final List types = VtdUtilityParser
+ .getTextValue(
+ ap, vn, "//work:type");
+ if (!types.isEmpty()) {
+ workData.setType(types.get(0));
+ }
+
+ final List urls = VtdUtilityParser
+ .getTextValue(
+ ap, vn, "//common:url");
+ if (!urls.isEmpty()) {
+ workData.setUrls(urls);
+ }
+
+ workData.setPublicationDates(getPublicationDates(vg, vn, ap));
+ workData.setExtIds(getExternalIds(vg, vn, ap));
+ workData.setContributors(getContributors(vg, vn, ap));
+ return workData;
+
+ }
+
+ private static List getPublicationDates(VTDGen vg, VTDNav vn, AutoPilot ap)
+ throws XPathParseException, NavException, XPathEvalException {
+ List publicationDates = new ArrayList();
+ int yearIndex = 0;
+ ap.selectXPath("//common:publication-date/common:year");
+ while (ap.evalXPath() != -1) {
+ PublicationDate publicationDate = new PublicationDate();
+ int t = vn.getText();
+ if (t >= 0) {
+ publicationDate.setYear(vn.toNormalizedString(t));
+ publicationDates.add(yearIndex, publicationDate);
+ yearIndex++;
+ }
+ }
+ int monthIndex = 0;
+ ap.selectXPath("//common:publication-date/common:month");
+ while (ap.evalXPath() != -1) {
+ int t = vn.getText();
+ if (t >= 0) {
+ publicationDates.get(monthIndex).setMonth(vn.toNormalizedString(t));
+ monthIndex++;
+ }
+ }
+ int dayIndex = 0;
+ ap.selectXPath("//common:publication-date/common:day");
+ while (ap.evalXPath() != -1) {
+ int t = vn.getText();
+ if (t >= 0) {
+ publicationDates.get(dayIndex).setDay(vn.toNormalizedString(t));
+ dayIndex++;
+ }
+ }
+ return publicationDates;
+ }
+
+ private static List getExternalIds(VTDGen vg, VTDNav vn, AutoPilot ap)
+ throws XPathParseException, NavException, XPathEvalException {
+ List extIds = new ArrayList();
+ int typeIndex = 0;
+ ap.selectXPath("//common:external-id/common:external-id-type");
+ while (ap.evalXPath() != -1) {
+ ExternalId extId = new ExternalId();
+ int t = vn.getText();
+ if (t >= 0) {
+ extId.setType(vn.toNormalizedString(t));
+ extIds.add(typeIndex, extId);
+ typeIndex++;
+ }
+ }
+ int valueIndex = 0;
+ ap.selectXPath("//common:external-id/common:external-id-value");
+ while (ap.evalXPath() != -1) {
+ int t = vn.getText();
+ if (t >= 0) {
+ extIds.get(valueIndex).setValue(vn.toNormalizedString(t));
+ valueIndex++;
+ }
+ }
+ int relationshipIndex = 0;
+ ap.selectXPath("//common:external-id/common:external-id-relationship");
+ while (ap.evalXPath() != -1) {
+ int t = vn.getText();
+ if (t >= 0) {
+ extIds.get(relationshipIndex).setRelationShip(vn.toNormalizedString(t));
+ relationshipIndex++;
+ }
+ }
+ if (typeIndex == valueIndex) {
+ return extIds;
+ }
+ return new ArrayList();
+ }
+
+ private static List getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
+ throws XPathParseException, NavException, XPathEvalException {
+ List contributors = new ArrayList();
+ int nameIndex = 0;
+ ap.selectXPath("//work:contributor/work:credit-name");
+ while (ap.evalXPath() != -1) {
+ Contributor contributor = new Contributor();
+ int t = vn.getText();
+ if (t >= 0) {
+ contributor.setCreditName(vn.toNormalizedString(t));
+ contributors.add(nameIndex, contributor);
+ nameIndex++;
+ }
+ }
+
+ int sequenceIndex = 0;
+ ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence");
+ while (ap.evalXPath() != -1) {
+ int t = vn.getText();
+ if (t >= 0) {
+ contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t));
+ sequenceIndex++;
+ }
+ }
+
+ int roleIndex = 0;
+ ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
+ while (ap.evalXPath() != -1) {
+ int t = vn.getText();
+ if (t >= 0) {
+ contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
+ roleIndex++;
+ }
+ }
+ return contributors;
+ }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
index d5da4eec02..4d8237f779 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
@@ -44,7 +44,7 @@ public class XMLRecordParserTest {
String xml = IOUtils
.toString(
- this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml"));
+ this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
XMLRecordParser p = new XMLRecordParser();
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
new file mode 100644
index 0000000000..31f8432ac9
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -0,0 +1,298 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.xml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.text.Normalizer;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.text.similarity.JaccardSimilarity;
+import org.apache.commons.text.similarity.JaroWinklerSimilarity;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+/**
+ * Tests for {@link XMLRecordParserNoDoi} and for matching an ORCID author against
+ * the contributors of a work.
+ * <p>
+ * Test methods are package-private rather than private, and carry no foreign
+ * {@code @Ignore} annotation, so that JUnit 5 discovers and runs every one of them.
+ */
+public class OrcidNoDoiTest {
+
+    private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
+
+    private static final String nameA = "Khairy";
+    private static final String surnameA = "Abdel Dayem";
+    private static final String otherNameA = "Dayem MKA";
+    private static final String nameB = "K";
+    private static final String surnameB = "Abdel-Dayem";
+    private static final String orcidIdA = "0000-0003-2760-1191";
+    private static final double threshold = 0.8;
+
+    @Test
+    void similarityTest() {
+        logger.info("running testSimilarity ....");
+        logger.info("JaroWinklerSimilarity: {}", similarityJaroWinkler(nameA, surnameA, nameB, surnameB));
+        logger.info("JaccardSimilarity: {}", similarityJaccard(nameA, surnameA, nameB, surnameB));
+    }
+
+    @Test
+    void bestMatchTest() {
+        logger.info("running bestMatchTest ....");
+        final String contributor = surnameB + ", " + nameB;
+        logger.info("score: {}", bestMatch(surnameA, nameA, contributor));
+    }
+
+    @Test
+    void readPublicationFieldsTest() throws Exception {
+        logger.info("running loadPublicationFieldsTest ....");
+        final WorkDataNoDoi workData = parseWork("activity_work_0000-0003-2760-1191.xml");
+        assertNotNull(workData);
+        assertNotNull(workData.getOid());
+        logger.info("oid: {}", workData.getOid());
+        assertNotNull(workData.getTitles());
+        logger.info("titles: ");
+        workData.getTitles().forEach(logger::info);
+        logger.info("source: {}", workData.getSourceName());
+        logger.info("type: {}", workData.getType());
+        assertNotNull(workData.getUrls());
+        logger.info("urls: ");
+        workData.getUrls().forEach(logger::info);
+        logger.info("publication date: ");
+        workData
+            .getPublicationDates()
+            .forEach(d -> logger.info("{} - {} - {}", d.getYear(), d.getMonth(), d.getDay()));
+        logger.info("external id: ");
+        workData.getExtIds().removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self"));
+        workData
+            .getExtIds()
+            .forEach(e -> logger.info("{} - {} - {}", e.getType(), e.getValue(), e.getRelationShip()));
+        logger.info("contributors: ");
+        // contributors parsed from a work carry a credit name; name and surname are not set yet
+        workData
+            .getContributors()
+            .forEach(c -> logger.info("{} - {} - {}", c.getCreditName(), c.getRole(), c.getSequence()));
+    }
+
+    @Test
+    void authorSimpleMatchTest() throws Exception {
+        logger.info("running authorSimpleMatchTest ....");
+        final WorkDataNoDoi workData = authorMatchTest("activity_work_0000-0003-2760-1191.xml");
+        // a single credit name ("Abdel-Dayem K") contains the author's surname
+        assertEquals(1, countWithAuthorOrcid(workData.getContributors()));
+    }
+
+    @Test
+    void authorSimilarityMatchTest() throws Exception {
+        logger.info("running authorSimilarityMatchTest ....");
+        final WorkDataNoDoi workData = authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
+        // two credit names contain the surname; only the best scoring one gets the ORCID iD
+        assertEquals(1, countWithAuthorOrcid(workData.getContributors()));
+    }
+
+    /** Parses the classpath resource with the given name as an ORCID work. */
+    private static WorkDataNoDoi parseWork(String resourceName) throws Exception {
+        try (InputStream in = OrcidNoDoiTest.class.getResourceAsStream(resourceName)) {
+            assertNotNull(in, "test resource not found: " + resourceName);
+            // hand the raw bytes to the parser instead of decoding and re-encoding them
+            // with the platform default charset
+            return XMLRecordParserNoDoi.VTDParseWorkData(IOUtils.toByteArray(in));
+        }
+    }
+
+    private static long countWithAuthorOrcid(List<Contributor> contributors) {
+        return contributors.stream().filter(c -> orcidIdA.equals(c.getOid())).count();
+    }
+
+    /**
+     * Parses the given work and assigns the test author to the matching contributor:
+     * directly when exactly one credit name contains the author's name, surname or
+     * other name; otherwise to the best scoring candidate at or above the threshold.
+     *
+     * @return the parsed work with its contributors updated
+     */
+    private static WorkDataNoDoi authorMatchTest(String orcidWork) throws Exception {
+        final AuthorData author = new AuthorData();
+        author.setName(nameA);
+        author.setSurname(surnameA);
+        author.setOtherName(otherNameA);
+        author.setOid(orcidIdA);
+
+        final WorkDataNoDoi workData = parseWork(orcidWork);
+        assertNotNull(workData);
+        final List<Contributor> contributors = workData.getContributors();
+
+        int matchCounter = 0;
+        for (Contributor c : contributors) {
+            if (isSimpleMatch(c, author)) {
+                c.setSimpleMatch(true);
+                matchCounter++;
+            }
+        }
+        logger.info("match counter: {}", matchCounter);
+
+        if (matchCounter == 1) {
+            updateAuthorsSimpleMatch(contributors, author);
+        } else if (matchCounter > 1) {
+            for (Contributor c : contributors) {
+                if (c.isSimpleMatch()) {
+                    // bestMatch expects the surname first, then the name
+                    c.setScore(bestMatch(author.getSurname(), author.getName(), c.getCreditName()));
+                    logger.debug("candidate: {} score: {}", c.getCreditName(), c.getScore());
+                }
+            }
+            final Optional<Contributor> best = contributors
+                .stream()
+                .filter(Contributor::isSimpleMatch)
+                .filter(c -> c.getScore() >= threshold)
+                .max(Comparator.comparing(Contributor::getScore));
+            if (best.isPresent()) {
+                best.get().setBestMatch(true);
+                logger.info("best match: {}", best.get().getCreditName());
+                updateAuthorsSimilarityMatch(contributors, author);
+            }
+        }
+
+        logger.info("UPDATED contributors: ");
+        contributors.forEach(OrcidNoDoiTest::logContributor);
+        return workData;
+    }
+
+    private static boolean isSimpleMatch(Contributor c, AuthorData author) {
+        final String creditName = normalize(c.getCreditName());
+        return creditName.contains(normalize(author.getName()))
+            || creditName.contains(normalize(author.getSurname()))
+            || (author.getOtherName() != null && creditName.contains(normalize(author.getOtherName())));
+    }
+
+    private static void assignAuthor(Contributor c, AuthorData author) {
+        c.setName(author.getName());
+        c.setSurname(author.getSurname());
+        c.setOid(author.getOid());
+    }
+
+    private static void logContributor(Contributor c) {
+        logger
+            .info(
+                "{} - {} - {} - {} - {} - {} - best: {} - simple: {}",
+                c.getOid(), c.getCreditName(), c.getName(), c.getSurname(),
+                c.getRole(), c.getSequence(), c.isBestMatch(), c.isSimpleMatch());
+    }
+
+    private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
+        for (Contributor c : contributors) {
+            if (c.isSimpleMatch()) {
+                logger.info("simple match on : {}", c.getCreditName());
+                assignAuthor(c, author);
+            }
+        }
+        updateRanks(contributors);
+    }
+
+    private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
+        logger.info("inside updateAuthorsSimilarityMatch ...");
+        contributors.forEach(OrcidNoDoiTest::logContributor);
+        for (Contributor c : contributors) {
+            if (c.isBestMatch()) {
+                logger.info("similarity match on : {}", c.getCreditName());
+                assignAuthor(c, author);
+            }
+        }
+        updateRanks(contributors);
+    }
+
+    /**
+     * Numbers the contributors 1..n in list order, unless at least one author already
+     * carries a "first" or "additional" sequence, in which case nothing is changed.
+     */
+    private static void updateRanks(List<Contributor> contributors) {
+        final boolean seqFound = contributors
+            .stream()
+            .anyMatch(
+                c -> "author".equals(c.getRole())
+                    && ("first".equals(c.getSequence()) || "additional".equals(c.getSequence())));
+        if (seqFound) {
+            logger.info("sequence data found");
+            return;
+        }
+        int rank = 0;
+        for (Contributor c : contributors) {
+            rank++;
+            c.setSequence(Integer.toString(rank));
+        }
+    }
+
+    /**
+     * Scores a contributor's credit name against an author, trying the last token of
+     * the credit name both as the name and as the surname and keeping the higher score.
+     */
+    private static Double bestMatch(String authorSurname, String authorName, String contributor) {
+        logger.debug("{} {} vs {}", authorSurname, authorName, contributor);
+        final String[] tokens = contributor.split(" ");
+        if (tokens.length == 0) {
+            return 0.0;
+        }
+        final String contributorName = tokens[tokens.length - 1];
+        final String contributorSurname = String.join(" ", Arrays.asList(tokens).subList(0, tokens.length - 1));
+        logger.debug("contributorName: {} contributorSurname: {}", contributorName, contributorSurname);
+        final String authorNameNrm = normalize(authorName);
+        final String authorSurnameNrm = normalize(authorSurname);
+        final String contributorNameNrm = normalize(contributorName);
+        final String contributorSurnameNrm = normalize(contributorSurname);
+        final Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
+        final Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
+        return sm1.compareTo(sm2) >= 0 ? sm1 : sm2;
+    }
+
+    private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
+        final Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
+        logger.debug("{}, {} <> {}, {} score: {}", nameA, surnameA, nameB, surnameB, score);
+        return score;
+    }
+
+    private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
+        return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
+    }
+
+    private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
+        return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
+    }
+
+    private static String parse(String name, String surname) {
+        return surname + " " + name;
+    }
+
+    private static String normalize(final String s) {
+        return nfd(s)
+            .toLowerCase()
+            // do not compact the regexes in a single expression, would cause StackOverflowError
+            // in case of large input strings
+            .replaceAll("(\\W)+", " ")
+            .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+            .replaceAll("(\\p{Punct})+", " ")
+            .replaceAll("(\\d)+", " ")
+            .replaceAll("(\\n)+", " ")
+            .trim();
+    }
+
+    private static String nfd(final String s) {
+        return Normalizer.normalize(s, Normalizer.Form.NFD);
+    }
+}
+
+//
+// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
+// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
+//
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml
new file mode 100644
index 0000000000..485f4f8e8c
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml
@@ -0,0 +1,106 @@
+
+
+ 2016-12-12T23:02:05.233Z
+ 2016-12-13T09:08:16.412Z
+
+
+ https://orcid.org/0000-0002-9157-3431
+ 0000-0002-9157-3431
+ orcid.org
+
+ Europe PubMed Central
+
+
+ Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+ Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+ ST-Segment-Elevation Myocardial Infarction.
+
+
+ formatted-unspecified
+ Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+ Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016
+
+ journal-article
+
+ 2016
+ 11
+
+
+
+ pmid
+ 27899851
+ 27899851
+ self
+
+
+ pmc
+ PMC5126442
+ PMC5126442
+ self
+
+
+ http://europepmc.org/abstract/med/27899851
+
+
+ Abdel-Dayem K
+
+ first
+ author
+
+
+
+ Eweda II
+
+ first
+ author
+
+
+
+ El-Sherbiny A
+
+ first
+ author
+
+
+
+ Dimitry MO
+
+ first
+ author
+
+
+
+ Nammas W
+
+ first
+ author
+
+
+
+
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml
similarity index 100%
rename from dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml
rename to dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml
new file mode 100644
index 0000000000..650d5a4cb4
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml
@@ -0,0 +1,113 @@
+
+
+ 2016-12-12T23:02:05.233Z
+ 2016-12-13T09:08:16.412Z
+
+
+ https://orcid.org/0000-0002-9157-3431
+ 0000-0002-9157-3431
+ orcid.org
+
+ Europe PubMed Central
+
+
+ Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+ Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+ ST-Segment-Elevation Myocardial Infarction.
+
+
+ formatted-unspecified
+ Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+ Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016
+
+ journal-article
+
+ 2016
+ 11
+
+
+
+ pmid
+ 27899851
+ 27899851
+ self
+
+
+ pmc
+ PMC5126442
+ PMC5126442
+ self
+
+
+ http://europepmc.org/abstract/med/27899851
+
+
+ Abdel-Dayem K
+
+ first
+ author
+
+
+
+ Abdel-Dayem Fake
+
+ first
+ author
+
+
+
+ Eweda II
+
+ first
+ author
+
+
+
+ El-Sherbiny A
+
+ first
+ author
+
+
+
+ Dimitry MO
+
+ first
+ author
+
+
+
+ Nammas W
+
+ first
+ author
+
+
+
+
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
new file mode 100644
index 0000000000..485f4f8e8c
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
@@ -0,0 +1,106 @@
+
+
+ 2016-12-12T23:02:05.233Z
+ 2016-12-13T09:08:16.412Z
+
+
+ https://orcid.org/0000-0002-9157-3431
+ 0000-0002-9157-3431
+ orcid.org
+
+ Europe PubMed Central
+
+
+ Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+ Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+ ST-Segment-Elevation Myocardial Infarction.
+
+
+ formatted-unspecified
+ Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+ Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016
+
+ journal-article
+
+ 2016
+ 11
+
+
+
+ pmid
+ 27899851
+ 27899851
+ self
+
+
+ pmc
+ PMC5126442
+ PMC5126442
+ self
+
+
+ http://europepmc.org/abstract/med/27899851
+
+
+ Abdel-Dayem K
+
+ first
+ author
+
+
+
+ Eweda II
+
+ first
+ author
+
+
+
+ El-Sherbiny A
+
+ first
+ author
+
+
+
+ Dimitry MO
+
+ first
+ author
+
+
+
+ Nammas W
+
+ first
+ author
+
+
+
+