From fcbb4c148929c06756756f18a14cece1a5c232b3 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Wed, 24 Jun 2020 16:29:32 +0200
Subject: [PATCH 01/34] parser of orcid publication data from xml original dump

---
 dhp-workflows/dhp-doiboost/pom.xml            |   6 +
 .../doiboost/orcid/model/AuthorData.java      |   9 +
 .../orcidnodoi/model/Contributor.java         |  54 +++
 .../doiboost/orcidnodoi/model/ExternalId.java |  32 ++
 .../orcidnodoi/model/PublicationDate.java     |  32 ++
 .../orcidnodoi/model/WorkDataNoDoi.java       | 101 ++++++
 .../orcidnodoi/xml/XMLRecordParserNoDoi.java  | 216 ++++++++++++
 .../orcid/xml/XMLRecordParserTest.java        |   2 +-
 .../orcidnodoi/xml/OrcidNoDoiTest.java        | 326 ++++++++++++++++++
 .../xml/activity_work_0000-0003-2760-1191.xml | 106 ++++++
 .../xml/activity_work_0000-0002-5982-8983.xml |   0
 ...ty_work_0000-0003-2760-1191-similarity.xml | 113 ++++++
 .../xml/activity_work_0000-0003-2760-1191.xml | 106 ++++++
 13 files changed, 1102 insertions(+), 1 deletion(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml
 rename dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/{orcid => orcidnodoi}/xml/activity_work_0000-0002-5982-8983.xml (100%)
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml
index 39bb81ec1..2662d0a39 100644
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@@ -84,6 +84,12 @@
             <artifactId>spark-sql_2.11</artifactId>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+            <version>1.8</version>
+        </dependency>
+
 
 
     </dependencies>
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
index 29551c347..87f1f65c8 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
@@ -9,6 +9,7 @@ public class AuthorData implements Serializable {
 	private String name;
 	private String surname;
 	private String creditName;
+	private String otherName;
 	private String errorCode;
 
 	public String getErrorCode() {
@@ -50,4 +51,12 @@ public class AuthorData implements Serializable {
 	public void setOid(String oid) {
 		this.oid = oid;
 	}
+
+	public String getOtherName() {
+		return otherName;
+	}
+
+	public void setOtherName(String otherName) {
+		this.otherName = otherName;
+	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
new file mode 100644
index 000000000..42076de5d
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
@@ -0,0 +1,54 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+import java.io.Serializable;
+
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+
+public class Contributor extends AuthorData implements Serializable {
+	private String sequence;
+	private String role;
+	private boolean simpleMatch = false;
+	private Double score = 0.0;
+	private boolean bestMatch = false;
+
+	public String getSequence() {
+		return sequence;
+	}
+
+	public void setSequence(String sequence) {
+		this.sequence = sequence;
+	}
+
+	public String getRole() {
+		return role;
+	}
+
+	public void setRole(String role) {
+		this.role = role;
+	}
+
+	public boolean isSimpleMatch() {
+		return simpleMatch;
+	}
+
+	public void setSimpleMatch(boolean simpleMatch) {
+		this.simpleMatch = simpleMatch;
+	}
+
+	public Double getScore() {
+		return score;
+	}
+
+	public void setScore(Double score) {
+		this.score = score;
+	}
+
+	public boolean isBestMatch() {
+		return bestMatch;
+	}
+
+	public void setBestMatch(boolean bestMatch) {
+		this.bestMatch = bestMatch;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
new file mode 100644
index 000000000..865e54ae3
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
@@ -0,0 +1,32 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+public class ExternalId {
+	private String type;
+	private String value;
+	private String relationShip;
+
+	public String getType() {
+		return type;
+	}
+
+	public void setType(String type) {
+		this.type = type;
+	}
+
+	public String getValue() {
+		return value;
+	}
+
+	public void setValue(String value) {
+		this.value = value;
+	}
+
+	public String getRelationShip() {
+		return relationShip;
+	}
+
+	public void setRelationShip(String relationShip) {
+		this.relationShip = relationShip;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
new file mode 100644
index 000000000..9282a80ba
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
@@ -0,0 +1,32 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+public class PublicationDate {
+	private String year;
+	private String month;
+	private String day;
+
+	public String getYear() {
+		return year;
+	}
+
+	public void setYear(String year) {
+		this.year = year;
+	}
+
+	public String getMonth() {
+		return month;
+	}
+
+	public void setMonth(String month) {
+		this.month = month;
+	}
+
+	public String getDay() {
+		return day;
+	}
+
+	public void setDay(String day) {
+		this.day = day;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
new file mode 100644
index 000000000..ee13454e1
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
@@ -0,0 +1,101 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.model;
+
+import java.io.Serializable;
+import java.util.List;
+
+public class WorkDataNoDoi implements Serializable {
+
+	private String oid;
+	private String id;
+	private String sourceName;
+	private String type;
+	private List<String> titles;
+	private List<String> urls;
+	List<ExternalId> extIds;
+	List<PublicationDate> publicationDates;
+	List<Contributor> contributors;
+
+	public String getOid() {
+		return oid;
+	}
+
+	public void setOid(String oid) {
+		this.oid = oid;
+	}
+
+	public String getErrorCode() {
+		return errorCode;
+	}
+
+	public void setErrorCode(String errorCode) {
+		this.errorCode = errorCode;
+	}
+
+	private String errorCode;
+
+	public String getId() {
+		return id;
+	}
+
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	public List<String> getTitles() {
+		return titles;
+	}
+
+	public void setTitles(List<String> titles) {
+		this.titles = titles;
+	}
+
+	public String getSourceName() {
+		return sourceName;
+	}
+
+	public void setSourceName(String sourceName) {
+		this.sourceName = sourceName;
+	}
+
+	public String getType() {
+		return type;
+	}
+
+	public void setType(String type) {
+		this.type = type;
+	}
+
+	public List<String> getUrls() {
+		return urls;
+	}
+
+	public void setUrls(List<String> urls) {
+		this.urls = urls;
+	}
+
+	public List<ExternalId> getExtIds() {
+		return extIds;
+	}
+
+	public void setExtIds(List<ExternalId> extIds) {
+		this.extIds = extIds;
+	}
+
+	public List<PublicationDate> getPublicationDates() {
+		return publicationDates;
+	}
+
+	public void setPublicationDates(List<PublicationDate> publicationDates) {
+		this.publicationDates = publicationDates;
+	}
+
+	public List<Contributor> getContributors() {
+		return contributors;
+	}
+
+	public void setContributors(List<Contributor> contributors) {
+		this.contributors = contributors;
+	}
+
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
new file mode 100644
index 000000000..6e5771547
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
@@ -0,0 +1,216 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.xml;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ximpleware.*;
+
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
+import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+public class XMLRecordParserNoDoi {
+
+	private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);
+
+	private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
+	private static final String NS_COMMON = "common";
+	private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
+	private static final String NS_PERSON = "person";
+	private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
+	private static final String NS_DETAILS = "personal-details";
+	private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
+	private static final String NS_OTHER = "other-name";
+	private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
+	private static final String NS_RECORD = "record";
+	private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
+
+	private static final String NS_WORK = "work";
+	private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
+
+	private static final String NS_ERROR = "error";
+
+	public static WorkDataNoDoi VTDParseWorkData(byte[] bytes)
+		throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
+		NavException, XPathEvalException {
+		logger.info("parsing xml ...");
+		final VTDGen vg = new VTDGen();
+		vg.setDoc(bytes);
+		vg.parse(true);
+		final VTDNav vn = vg.getNav();
+		final AutoPilot ap = new AutoPilot(vn);
+		ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
+		ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
+		ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
+
+		WorkDataNoDoi workData = new WorkDataNoDoi();
+		final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
+		if (!errors.isEmpty()) {
+			workData.setErrorCode(errors.get(0));
+			return workData;
+		}
+
+		List<VtdUtilityParser.Node> workNodes = VtdUtilityParser
+			.getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "put-code"));
+		if (!workNodes.isEmpty()) {
+			final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
+			workData.setOid(oid);
+			final String id = (workNodes.get(0).getAttributes().get("put-code"));
+			workData.setId(id);
+		} else {
+			return null;
+		}
+
+		final List<String> titles = VtdUtilityParser
+			.getTextValue(
+				ap, vn, "//common:title");
+		if (!titles.isEmpty()) {
+			workData.setTitles(titles);
+		}
+
+		final List<String> sourceNames = VtdUtilityParser
+			.getTextValue(
+				ap, vn, "//common:source-name");
+		if (!sourceNames.isEmpty()) {
+			workData.setSourceName(sourceNames.get(0));
+		}
+
+		final List<String> types = VtdUtilityParser
+			.getTextValue(
+				ap, vn, "//work:type");
+		if (!types.isEmpty()) {
+			workData.setType(types.get(0));
+		}
+
+		final List<String> urls = VtdUtilityParser
+			.getTextValue(
+				ap, vn, "//common:url");
+		if (!urls.isEmpty()) {
+			workData.setUrls(urls);
+		}
+
+		workData.setPublicationDates(getPublicationDates(vg, vn, ap));
+		workData.setExtIds(getExternalIds(vg, vn, ap));
+		workData.setContributors(getContributors(vg, vn, ap));
+		return workData;
+
+	}
+
+	private static List<PublicationDate> getPublicationDates(VTDGen vg, VTDNav vn, AutoPilot ap)
+		throws XPathParseException, NavException, XPathEvalException {
+		List<PublicationDate> publicationDates = new ArrayList<PublicationDate>();
+		int yearIndex = 0;
+		ap.selectXPath("//common:publication-date/common:year");
+		while (ap.evalXPath() != -1) {
+			PublicationDate publicationDate = new PublicationDate();
+			int t = vn.getText();
+			if (t >= 0) {
+				publicationDate.setYear(vn.toNormalizedString(t));
+				publicationDates.add(yearIndex, publicationDate);
+				yearIndex++;
+			}
+		}
+		int monthIndex = 0;
+		ap.selectXPath("//common:publication-date/common:month");
+		while (ap.evalXPath() != -1) {
+			int t = vn.getText();
+			if (t >= 0) {
+				publicationDates.get(monthIndex).setMonth(vn.toNormalizedString(t));
+				monthIndex++;
+			}
+		}
+		int dayIndex = 0;
+		ap.selectXPath("//common:publication-date/common:day");
+		while (ap.evalXPath() != -1) {
+			int t = vn.getText();
+			if (t >= 0) {
+				publicationDates.get(dayIndex).setDay(vn.toNormalizedString(t));
+				dayIndex++;
+			}
+		}
+		return publicationDates;
+	}
+
+	private static List<ExternalId> getExternalIds(VTDGen vg, VTDNav vn, AutoPilot ap)
+		throws XPathParseException, NavException, XPathEvalException {
+		List<ExternalId> extIds = new ArrayList<ExternalId>();
+		int typeIndex = 0;
+		ap.selectXPath("//common:external-id/common:external-id-type");
+		while (ap.evalXPath() != -1) {
+			ExternalId extId = new ExternalId();
+			int t = vn.getText();
+			if (t >= 0) {
+				extId.setType(vn.toNormalizedString(t));
+				extIds.add(typeIndex, extId);
+				typeIndex++;
+			}
+		}
+		int valueIndex = 0;
+		ap.selectXPath("//common:external-id/common:external-id-value");
+		while (ap.evalXPath() != -1) {
+			int t = vn.getText();
+			if (t >= 0) {
+				extIds.get(valueIndex).setValue(vn.toNormalizedString(t));
+				valueIndex++;
+			}
+		}
+		int relationshipIndex = 0;
+		ap.selectXPath("//common:external-id/common:external-id-relationship");
+		while (ap.evalXPath() != -1) {
+			int t = vn.getText();
+			if (t >= 0) {
+				extIds.get(relationshipIndex).setRelationShip(vn.toNormalizedString(t));
+				relationshipIndex++;
+			}
+		}
+		if (typeIndex == valueIndex) {
+			return extIds;
+		}
+		return new ArrayList<ExternalId>();
+	}
+
+	private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
+		throws XPathParseException, NavException, XPathEvalException {
+		List<Contributor> contributors = new ArrayList<Contributor>();
+		int nameIndex = 0;
+		ap.selectXPath("//work:contributor/work:credit-name");
+		while (ap.evalXPath() != -1) {
+			Contributor contributor = new Contributor();
+			int t = vn.getText();
+			if (t >= 0) {
+				contributor.setCreditName(vn.toNormalizedString(t));
+				contributors.add(nameIndex, contributor);
+				nameIndex++;
+			}
+		}
+
+		int sequenceIndex = 0;
+		ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence");
+		while (ap.evalXPath() != -1) {
+			int t = vn.getText();
+			if (t >= 0) {
+				contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t));
+				sequenceIndex++;
+			}
+		}
+
+		int roleIndex = 0;
+		ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
+		while (ap.evalXPath() != -1) {
+			int t = vn.getText();
+			if (t >= 0) {
+				contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
+				roleIndex++;
+			}
+		}
+		return contributors;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
index d5da4eec0..4d8237f77 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
@@ -44,7 +44,7 @@ public class XMLRecordParserTest {
 
 		String xml = IOUtils
 			.toString(
-				this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml"));
+				this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
 
 		XMLRecordParser p = new XMLRecordParser();
 
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
new file mode 100644
index 000000000..31f8432ac
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -0,0 +1,326 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.xml;
+
+import com.ximpleware.NavException;
+import com.ximpleware.ParseException;
+import com.ximpleware.XPathEvalException;
+import com.ximpleware.XPathParseException;
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import jdk.nashorn.internal.ir.annotations.Ignore;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.text.similarity.JaccardSimilarity;
+import org.apache.commons.text.similarity.JaroWinklerSimilarity;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.text.Normalizer;
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+public class OrcidNoDoiTest {
+
+	private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
+
+	String nameA = "Khairy";
+	String surnameA = "Abdel Dayem";
+	String otherNameA = "Dayem MKA";
+	String nameB = "K";
+	String surnameB = "Abdel-Dayem";
+	String orcidIdA = "0000-0003-2760-1191";
+	Double threshold = 0.8;
+
+	@Test
+	@Ignore
+	private void similarityTest() throws Exception {
+		logger.info("running testSimilarity ....");
+		logger
+			.info(
+				"JaroWinklerSimilarity: "
+					+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
+		logger
+			.info(
+				"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
+	}
+
+	@Test
+	@Ignore
+	private void bestMatchTest() throws Exception {
+		logger.info("running bestMatchTest ....");
+		String contributor = surnameB + ", " + nameB;
+		logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
+	}
+
+	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
+		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
+		String[] contributorSplitted = contributor.split(" ");
+		if (contributorSplitted.length == 0) {
+			return 0.0;
+		}
+		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
+		String contributorSurname = "";
+		if (contributorSplitted.length > 1) {
+			StringJoiner joiner = new StringJoiner(" ");
+			for (int i = 0; i < contributorSplitted.length - 1; i++) {
+				joiner.add(contributorSplitted[i]);
+			}
+			contributorSurname = joiner.toString();
+		}
+		logger
+			.debug(
+				"contributorName: " + contributorName +
+					" contributorSurname: " + contributorSurname);
+		String authorNameNrm = normalize(authorName);
+		String authorSurnameNrm = normalize(authorSurname);
+		String contributorNameNrm = normalize(contributorName);
+		String contributorSurnameNrm = normalize(contributorSurname);
+		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
+		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
+		if (sm1.compareTo(sm2) >= 0) {
+			return sm1;
+		}
+		return sm2;
+	}
+
+	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
+		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
+		logger
+			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + "   score: " + Double.toString(score));
+		return score;
+	}
+
+	private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
+		return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
+	}
+
+	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
+		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
+	}
+
+	private static String parse(String name, String surname) {
+		return surname + " " + name;
+	}
+
+	private static String normalize(final String s) {
+		return nfd(s)
+			.toLowerCase()
+			// do not compact the regexes in a single expression, would cause StackOverflowError
+			// in case
+			// of large input strings
+			.replaceAll("(\\W)+", " ")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim();
+	}
+
+	private static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+
+	@Test
+	@Ignore
+	public void readPublicationFieldsTest()
+		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+		logger.info("running loadPublicationFieldsTest ....");
+		String xml = IOUtils
+			.toString(
+				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
+
+		if (xml == null) {
+			logger.info("Resource not found");
+		}
+		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
+		if (p == null) {
+			logger.info("XMLRecordParserNoDoi null");
+		}
+		WorkDataNoDoi workData = null;
+		try {
+			workData = p.VTDParseWorkData(xml.getBytes());
+		} catch (Exception e) {
+			logger.error("parsing xml", e);
+		}
+		assertNotNull(workData);
+		assertNotNull(workData.getOid());
+		logger.info("oid: " + workData.getOid());
+		assertNotNull(workData.getTitles());
+		logger.info("titles: ");
+		workData.getTitles().forEach(t -> {
+			logger.info(t);
+		});
+		logger.info("source: " + workData.getSourceName());
+		logger.info("type: " + workData.getType());
+		logger.info("urls: ");
+		workData.getUrls().forEach(u -> {
+			logger.info(u);
+		});
+		logger.info("publication date: ");
+		workData.getPublicationDates().forEach(d -> {
+			logger.info(d.getYear() + " - " + d.getMonth() + " - " + d.getDay());
+		});
+		logger.info("external id: ");
+		workData.getExtIds().removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self"));
+		workData.getExtIds().forEach(e -> {
+			logger.info(e.getType() + " - " + e.getValue() + " - " + e.getRelationShip());
+		});
+		logger.info("contributors: ");
+		workData.getContributors().forEach(c -> {
+			logger
+				.info(
+					c.getName() + " - " + c.getRole() + " - " + c.getSequence());
+		});
+
+	}
+
+	private void updateRanks(List<Contributor> contributors) {
+		boolean seqFound = false;
+		if (contributors
+			.stream()
+			.filter(
+				c -> c.getRole() != null && c.getSequence() != null &&
+					c.getRole().equals("author") && (c.getSequence().equals("first") ||
+						c.getSequence().equals("additional")))
+			.count() > 0) {
+			seqFound = true;
+			logger.info("sequence data found");
+		}
+		if (!seqFound) {
+			List<Integer> seqIds = Arrays.asList(0);
+			contributors.forEach(c -> {
+				int currentSeq = seqIds.get(0) + 1;
+				seqIds.set(0, currentSeq);
+				c.setSequence(Integer.toString(seqIds.get(0)));
+			});
+		}
+	}
+
+	private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
+		contributors.forEach(c -> {
+			if (c.isSimpleMatch()) {
+				logger.info("simple match on : " + c.getCreditName());
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			}
+		});
+		updateRanks(contributors);
+	}
+
+	private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
+		logger.info("inside updateAuthorsSimilarityMatch ...");
+		contributors.forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
+						+ c.isSimpleMatch());
+		});
+
+		contributors
+			.stream()
+			.filter(c -> c.isBestMatch())
+			.forEach(c -> {
+				logger.info("similarity match on : " + c.getCreditName());
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			});
+		updateRanks(contributors);
+	}
+
+	@Test
+	@Ignore
+	public void authorSimilarityMatchTest() throws Exception {
+		logger.info("running authorSimilarityMatchTest ....");
+		authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
+	}
+
+	@Test
+	private void authorSimpleMatchTest() throws Exception {
+		logger.info("running authorSimpleMatchTest ....");
+		authorMatchTest("activity_work_0000-0003-2760-1191.xml");
+	}
+
+	private void authorMatchTest(String orcidWork)
+		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+		AuthorData author = new AuthorData();
+		author.setName(nameA);
+		author.setSurname(surnameA);
+		author.setOid(orcidIdA);
+		String xml = IOUtils
+			.toString(
+				OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
+
+		if (xml == null) {
+			logger.info("Resource not found");
+		}
+		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
+		if (p == null) {
+			logger.info("XMLRecordParserNoDoi null");
+		}
+		WorkDataNoDoi workData = null;
+		try {
+			workData = p.VTDParseWorkData(xml.getBytes());
+		} catch (Exception e) {
+			logger.error("parsing xml", e);
+		}
+		assertNotNull(workData);
+		int matchCounter = 0;
+		List<Integer> matchCounters = Arrays.asList(matchCounter);
+		Contributor contributor = null;
+		workData.getContributors().forEach(c -> {
+			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
+				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
+				((author.getOtherName() != null)
+					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
+				matchCounters.set(0, matchCounters.get(0) + 1);
+				c.setSimpleMatch(true);
+			}
+		});
+		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
+		if (matchCounters.get(0) == 1) {
+			updateAuthorsSimpleMatch(workData.getContributors(), author);
+		} else if (matchCounters.get(0) > 1) {
+			Optional<Contributor> optCon = workData
+				.getContributors()
+				.stream()
+				.filter(c -> c.isSimpleMatch())
+				.map(c -> {
+					c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
+					logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
+					return c;
+				})
+				.filter(c -> c.getScore() >= threshold)
+				.max(Comparator.comparing(c -> c.getScore()));
+			Contributor bestMatchContributor = null;
+			if (optCon.isPresent()) {
+				bestMatchContributor = optCon.get();
+				bestMatchContributor.setBestMatch(true);
+				logger.info("best match: " + bestMatchContributor.getCreditName());
+				updateAuthorsSimilarityMatch(workData.getContributors(), author);
+			}
+
+		}
+
+		logger.info("UPDATED contributors: ");
+		workData.getContributors().forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence());
+		});
+	}
+}
+
+//
+//		orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
+//		no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
+//
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml
new file mode 100644
index 000000000..485f4f8e8
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0003-2760-1191.xml
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<work:work xmlns:address="http://www.orcid.org/ns/address"
+           xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
+           xmlns:employment="http://www.orcid.org/ns/employment"
+           xmlns:education="http://www.orcid.org/ns/education"
+           xmlns:other-name="http://www.orcid.org/ns/other-name"
+           xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+           xmlns:funding="http://www.orcid.org/ns/funding"
+           xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+           xmlns:service="http://www.orcid.org/ns/service"
+           xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+           xmlns:distinction="http://www.orcid.org/ns/distinction"
+           xmlns:internal="http://www.orcid.org/ns/internal"
+           xmlns:membership="http://www.orcid.org/ns/membership"
+           xmlns:person="http://www.orcid.org/ns/person"
+           xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+           xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+           xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+           xmlns:activities="http://www.orcid.org/ns/activities"
+           xmlns:qualification="http://www.orcid.org/ns/qualification"
+           xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+           xmlns:error="http://www.orcid.org/ns/error"
+           xmlns:preferences="http://www.orcid.org/ns/preferences"
+           xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+           xmlns:work="http://www.orcid.org/ns/work"
+           xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
+           path="/0000-0003-2760-1191/work/28776099" visibility="public">
+    <common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
+    <common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
+    <common:source>
+        <common:source-orcid>
+            <common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
+            <common:path>0000-0002-9157-3431</common:path>
+            <common:host>orcid.org</common:host>
+        </common:source-orcid>
+        <common:source-name>Europe PubMed Central</common:source-name>
+    </common:source>
+    <work:title>
+        <common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+            Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+            ST-Segment-Elevation Myocardial Infarction.</common:title>
+    </work:title>
+    <work:citation>
+        <work:citation-type>formatted-unspecified</work:citation-type>
+        <work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+            Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
+    </work:citation>
+    <work:type>journal-article</work:type>
+    <common:publication-date>
+        <common:year>2016</common:year>
+        <common:month>11</common:month>
+    </common:publication-date>
+    <common:external-ids>
+        <common:external-id>
+            <common:external-id-type>pmid</common:external-id-type>
+            <common:external-id-value>27899851</common:external-id-value>
+            <common:external-id-normalized transient="true">27899851</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+        <common:external-id>
+            <common:external-id-type>pmc</common:external-id-type>
+            <common:external-id-value>PMC5126442</common:external-id-value>
+            <common:external-id-normalized transient="true"
+            >PMC5126442</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+    </common:external-ids>
+    <common:url>http://europepmc.org/abstract/med/27899851</common:url>
+    <work:contributors>
+        <work:contributor>
+            <work:credit-name>Abdel-Dayem K</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Eweda II</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>El-Sherbiny A</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Dimitry MO</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Nammas W</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+    </work:contributors>
+</work:work>
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml
similarity index 100%
rename from dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml
rename to dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-5982-8983.xml
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml
new file mode 100644
index 000000000..650d5a4cb
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191-similarity.xml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<work:work xmlns:address="http://www.orcid.org/ns/address"
+           xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
+           xmlns:employment="http://www.orcid.org/ns/employment"
+           xmlns:education="http://www.orcid.org/ns/education"
+           xmlns:other-name="http://www.orcid.org/ns/other-name"
+           xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+           xmlns:funding="http://www.orcid.org/ns/funding"
+           xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+           xmlns:service="http://www.orcid.org/ns/service"
+           xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+           xmlns:distinction="http://www.orcid.org/ns/distinction"
+           xmlns:internal="http://www.orcid.org/ns/internal"
+           xmlns:membership="http://www.orcid.org/ns/membership"
+           xmlns:person="http://www.orcid.org/ns/person"
+           xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+           xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+           xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+           xmlns:activities="http://www.orcid.org/ns/activities"
+           xmlns:qualification="http://www.orcid.org/ns/qualification"
+           xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+           xmlns:error="http://www.orcid.org/ns/error"
+           xmlns:preferences="http://www.orcid.org/ns/preferences"
+           xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+           xmlns:work="http://www.orcid.org/ns/work"
+           xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
+           path="/0000-0003-2760-1191/work/28776099" visibility="public">
+    <common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
+    <common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
+    <common:source>
+        <common:source-orcid>
+            <common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
+            <common:path>0000-0002-9157-3431</common:path>
+            <common:host>orcid.org</common:host>
+        </common:source-orcid>
+        <common:source-name>Europe PubMed Central</common:source-name>
+    </common:source>
+    <work:title>
+        <common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+            Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+            ST-Segment-Elevation Myocardial Infarction.</common:title>
+    </work:title>
+    <work:citation>
+        <work:citation-type>formatted-unspecified</work:citation-type>
+        <work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+            Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
+    </work:citation>
+    <work:type>journal-article</work:type>
+    <common:publication-date>
+        <common:year>2016</common:year>
+        <common:month>11</common:month>
+    </common:publication-date>
+    <common:external-ids>
+        <common:external-id>
+            <common:external-id-type>pmid</common:external-id-type>
+            <common:external-id-value>27899851</common:external-id-value>
+            <common:external-id-normalized transient="true">27899851</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+        <common:external-id>
+            <common:external-id-type>pmc</common:external-id-type>
+            <common:external-id-value>PMC5126442</common:external-id-value>
+            <common:external-id-normalized transient="true"
+            >PMC5126442</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+    </common:external-ids>
+    <common:url>http://europepmc.org/abstract/med/27899851</common:url>
+    <work:contributors>
+        <work:contributor>
+            <work:credit-name>Abdel-Dayem K</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Abdel-Dayem Fake</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Eweda II</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>El-Sherbiny A</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Dimitry MO</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Nammas W</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+    </work:contributors>
+</work:work>
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
new file mode 100644
index 000000000..485f4f8e8
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<work:work xmlns:address="http://www.orcid.org/ns/address"
+           xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
+           xmlns:employment="http://www.orcid.org/ns/employment"
+           xmlns:education="http://www.orcid.org/ns/education"
+           xmlns:other-name="http://www.orcid.org/ns/other-name"
+           xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+           xmlns:funding="http://www.orcid.org/ns/funding"
+           xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+           xmlns:service="http://www.orcid.org/ns/service"
+           xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+           xmlns:distinction="http://www.orcid.org/ns/distinction"
+           xmlns:internal="http://www.orcid.org/ns/internal"
+           xmlns:membership="http://www.orcid.org/ns/membership"
+           xmlns:person="http://www.orcid.org/ns/person"
+           xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+           xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+           xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+           xmlns:activities="http://www.orcid.org/ns/activities"
+           xmlns:qualification="http://www.orcid.org/ns/qualification"
+           xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+           xmlns:error="http://www.orcid.org/ns/error"
+           xmlns:preferences="http://www.orcid.org/ns/preferences"
+           xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+           xmlns:work="http://www.orcid.org/ns/work"
+           xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
+           path="/0000-0003-2760-1191/work/28776099" visibility="public">
+    <common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
+    <common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
+    <common:source>
+        <common:source-orcid>
+            <common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
+            <common:path>0000-0002-9157-3431</common:path>
+            <common:host>orcid.org</common:host>
+        </common:source-orcid>
+        <common:source-name>Europe PubMed Central</common:source-name>
+    </common:source>
+    <work:title>
+        <common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+            Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+            ST-Segment-Elevation Myocardial Infarction.</common:title>
+    </work:title>
+    <work:citation>
+        <work:citation-type>formatted-unspecified</work:citation-type>
+        <work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+            Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
+    </work:citation>
+    <work:type>journal-article</work:type>
+    <common:publication-date>
+        <common:year>2016</common:year>
+        <common:month>11</common:month>
+    </common:publication-date>
+    <common:external-ids>
+        <common:external-id>
+            <common:external-id-type>pmid</common:external-id-type>
+            <common:external-id-value>27899851</common:external-id-value>
+            <common:external-id-normalized transient="true">27899851</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+        <common:external-id>
+            <common:external-id-type>pmc</common:external-id-type>
+            <common:external-id-value>PMC5126442</common:external-id-value>
+            <common:external-id-normalized transient="true"
+            >PMC5126442</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+    </common:external-ids>
+    <common:url>http://europepmc.org/abstract/med/27899851</common:url>
+    <work:contributors>
+        <work:contributor>
+            <work:credit-name>Abdel-Dayem K</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Eweda II</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>El-Sherbiny A</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Dimitry MO</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>Nammas W</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>first</work:contributor-sequence>
+                <work:contributor-role>author</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+    </work:contributors>
+</work:work>

From d6498278edc87aeb15ee61b33edf7f280829b56a Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 25 Jun 2020 18:43:29 +0200
Subject: [PATCH 02/34] added workflow to generate seq(orcidId,work) and
 seq(orcidId,enrichedWork)

---
 .../orcid/ActivitiesDecompressor.java         |   2 +-
 .../doiboost/orcid/SummariesDecompressor.java |   2 +-
 .../doiboost/orcid/json/JsonHelper.java       |  16 +
 .../orcidnodoi/ActivitiesDumpReader.java      | 149 +++++
 .../orcidnodoi/GenOrcidAuthorWork.java        |  52 ++
 .../SparkGenEnrichedOrcidWorks.java           | 119 ++++
 .../json/JsonWriter.java                      |   2 +-
 .../orcidnodoi/model/Contributor.java         |   6 +-
 .../orcidnodoi/model/WorkDataNoDoi.java       |   1 -
 .../orcidnodoi/similarity/AuthorMatcher.java  | 204 +++++++
 .../oozie_app/config-default.xml              |  22 +
 .../oozie_app/workflow.xml                    | 524 ++++++++++++++++++
 .../gen_enriched_orcid_works_parameters.json  |   7 +
 .../orcidnodoi/xml/OrcidNoDoiTest.java        | 250 +--------
 14 files changed, 1125 insertions(+), 231 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
 rename dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/{orcid => orcidnodoi}/json/JsonWriter.java (94%)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
index 570fdef17..80ccd71a1 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
@@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index f0bbb5c32..603bfedf6 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
new file mode 100644
index 000000000..13a3cee8f
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
@@ -0,0 +1,16 @@
+
+package eu.dnetlib.doiboost.orcid.json;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonObject;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+public class JsonHelper {
+
+	public static String createOidWork(WorkDataNoDoi workData) {
+		JsonObject oidWork = new JsonObject();
+		oidWork.addProperty("oid", workData.getOid());
+		oidWork.addProperty("work", new Gson().toJson(workData));
+		return oidWork.toString();
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
new file mode 100644
index 000000000..7eb6faf54
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -0,0 +1,149 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.mortbay.log.Log;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+
+public class ActivitiesDumpReader {
+
+	private static final int MAX_XML_WORKS_PARSED = -1;
+	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
+
+	public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
+		throws Exception {
+		String uri = inputUri;
+		FileSystem fs = FileSystem.get(URI.create(uri), conf);
+		Path inputPath = new Path(uri);
+		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+		CompressionCodec codec = factory.getCodec(inputPath);
+		if (codec == null) {
+			System.err.println("No codec found for " + uri);
+			System.exit(1);
+		}
+		CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
+		InputStream gzipInputStream = null;
+		try {
+			gzipInputStream = codec.createInputStream(fs.open(inputPath));
+			parseTarActivities(fs, conf, gzipInputStream, outputPath);
+
+		} finally {
+			Log.debug("Closing gzip stream");
+			IOUtils.closeStream(gzipInputStream);
+		}
+	}
+
+	private static void parseTarActivities(
+		FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
+		int counter = 0;
+		int noDoiFound = 0;
+		int errorFromOrcidFound = 0;
+		int xmlParserErrorFound = 0;
+		try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+			TarArchiveEntry entry = null;
+
+			try (SequenceFile.Writer writer = SequenceFile
+				.createWriter(
+					conf,
+					SequenceFile.Writer.file(outputPath),
+					SequenceFile.Writer.keyClass(Text.class),
+					SequenceFile.Writer.valueClass(Text.class))) {
+				while ((entry = tais.getNextTarEntry()) != null) {
+					String filename = entry.getName();
+
+					try {
+						if (entry.isDirectory() || !filename.contains("works")) {
+
+						} else {
+							Log.debug("XML work entry name: " + entry.getName());
+							counter++;
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
+																									// tarInput
+							String line;
+							StringBuffer buffer = new StringBuffer();
+							while ((line = br.readLine()) != null) {
+								buffer.append(line);
+							}
+							WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
+							if (workDataNoDoi != null) {
+								if (workDataNoDoi.getErrorCode() != null) {
+									errorFromOrcidFound += 1;
+									Log
+										.debug(
+											"error from Orcid with code "
+												+ workDataNoDoi.getErrorCode()
+												+ " for entry "
+												+ entry.getName());
+									continue;
+								}
+								boolean isDoiFound = workDataNoDoi.getExtIds().stream()
+										.filter(e -> e.getType()!=null)
+										.anyMatch(e -> e.getType().equals("doi"));
+								if (!isDoiFound) {
+									String jsonData = JsonHelper.createOidWork(workDataNoDoi);
+									Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
+
+									final Text key = new Text(workDataNoDoi.getOid());
+									final Text value = new Text(jsonData);
+
+									try {
+										writer.append(key, value);
+									} catch (IOException e) {
+										Log.debug("Writing to sequence file: " + e.getMessage());
+										Log.debug(e);
+										throw new RuntimeException(e);
+									}
+									noDoiFound += 1;
+								}
+
+							} else {
+								Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
+								xmlParserErrorFound += 1;
+							}
+						}
+					} catch (Exception e) {
+						Log
+							.warn(
+								"Parsing work from tar archive and xml work: " + filename + "  " + e.getMessage());
+						Log.warn(e);
+					}
+
+					if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
+						Log.info("Current xml works parsed: " + counter);
+					}
+
+					if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
+						break;
+					}
+				}
+			}
+		} catch (IOException e) {
+			Log.warn("Parsing work from gzip archive: " + e.getMessage());
+			Log.warn(e);
+			throw new RuntimeException(e);
+		}
+		Log.info("Activities parse completed");
+		Log.info("Total XML works parsed: " + counter);
+		Log.info("Total no doi work found: " + noDoiFound);
+		Log.info("Error from Orcid found: " + errorFromOrcidFound);
+		Log.info("Error parsing xml work found: " + xmlParserErrorFound);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
new file mode 100644
index 000000000..b82f4bc4c
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -0,0 +1,52 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.OrcidDSManager;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.mortbay.log.Log;
+
+import java.io.IOException;
+
+public class GenOrcidAuthorWork extends OrcidDSManager {
+
+	private String activitiesFileNameTarGz;
+	private String outputWorksPath;
+	private String workingPath;
+
+	public static void main(String[] args) throws IOException, Exception {
+		GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
+		genOrcidAuthorWork.loadArgs(args);
+		genOrcidAuthorWork.generateAuthorsDOIsData();
+	}
+
+	public void generateAuthorsDOIsData() throws Exception {
+		Configuration conf = initConfigurationObject();
+		FileSystem fs = initFileSystemObject(conf);
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
+		Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
+		ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
+	}
+
+	private void loadArgs(String[] args) throws IOException, Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					GenOrcidAuthorWork.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
+		parser.parseArgument(args);
+
+		hdfsServerUri = parser.get("hdfsServerUri");
+		Log.info("HDFS URI: " + hdfsServerUri);
+		workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + workingPath);
+		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
+		Log.info("Activities File Name: " + activitiesFileNameTarGz);
+		outputWorksPath = parser.get("outputWorksPath");
+		Log.info("Output Author Work Data: " + outputWorksPath);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
new file mode 100644
index 000000000..6bb31bcf6
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -0,0 +1,119 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Tuple2;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Optional;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+public class SparkGenEnrichedOrcidWorks {
+
+	public static void main(String[] args) throws IOException, Exception {
+		Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
+		logger.info("[ SparkGenerateDoiAuthorList STARTED]");
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkGenEnrichedOrcidWorks.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
+		parser.parseArgument(args);
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+		final String workingPath = parser.get("workingPath");
+		logger.info("workingPath: ", workingPath);
+		final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
+		logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath);
+		final String outputWorksPath = parser.get("outputWorksPath");
+		logger.info("outputWorksPath: ", outputWorksPath);
+
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+				JavaPairRDD<Text, Text> summariesRDD = sc
+					.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
+				Dataset<AuthorData> summariesDataset = spark
+					.createDataset(
+						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(AuthorData.class));
+
+				JavaPairRDD<Text, Text> activitiesRDD = sc
+					.sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class);
+				Dataset<WorkDataNoDoi> activitiesDataset = spark
+					.createDataset(
+						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(WorkDataNoDoi.class));
+
+				activitiesDataset
+						.joinWith(
+								summariesDataset,
+								activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
+						.map(
+								(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
+									WorkDataNoDoi w = value._1;
+									AuthorData a = value._2;
+									AuthorMatcher.match(a, w.getContributors());
+									return new Tuple2<>(a.getOid(), w);
+								},
+								Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
+						.filter(Objects::nonNull)
+						.toJavaRDD()
+						.saveAsTextFile(workingPath + outputEnrichedWorksPath);;
+			});
+	}
+
+	private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
+		AuthorData authorData = new AuthorData();
+		authorData.setOid(orcidId.toString());
+		JsonElement jElement = new JsonParser().parse(json.toString());
+		authorData.setName(getJsonValue(jElement, "name"));
+		authorData.setSurname(getJsonValue(jElement, "surname"));
+		authorData.setCreditName(getJsonValue(jElement, "creditname"));
+		return authorData;
+	}
+
+	private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
+		WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
+		return workData;
+	}
+
+	private static String getJsonValue(JsonElement jElement, String property) {
+		if (jElement.getAsJsonObject().has(property)) {
+			JsonElement name = null;
+			name = jElement.getAsJsonObject().get(property);
+			if (name != null && !name.isJsonNull()) {
+				return name.getAsString();
+			}
+		}
+		return null;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
similarity index 94%
rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java
rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
index 35676d5ba..7f7e3a10a 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
@@ -1,5 +1,5 @@
 
-package eu.dnetlib.doiboost.orcid.json;
+package eu.dnetlib.doiboost.orcidnodoi.json;
 
 import com.google.gson.JsonObject;
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
index 42076de5d..8a170de09 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
@@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
 public class Contributor extends AuthorData implements Serializable {
 	private String sequence;
 	private String role;
-	private boolean simpleMatch = false;
-	private Double score = 0.0;
-	private boolean bestMatch = false;
+	private transient boolean simpleMatch = false;
+	private transient Double score = 0.0;
+	private transient boolean bestMatch = false;
 
 	public String getSequence() {
 		return sequence;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
index ee13454e1..5756521e7 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
@@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable {
 	public void setContributors(List<Contributor> contributors) {
 		this.contributors = contributors;
 	}
-
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
new file mode 100644
index 000000000..09fd8b36b
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -0,0 +1,204 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.similarity;
+
+import java.io.IOException;
+import java.text.Normalizer;
+import java.util.*;
+
+import org.apache.commons.text.similarity.JaroWinklerSimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.ximpleware.NavException;
+import com.ximpleware.ParseException;
+import com.ximpleware.XPathEvalException;
+import com.ximpleware.XPathParseException;
+
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+public class AuthorMatcher {
+
+	private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
+	private static final Double threshold = 0.8;
+
+	public static void match(AuthorData author, List<Contributor> contributors)
+		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+
+		int matchCounter = 0;
+		List<Integer> matchCounters = Arrays.asList(matchCounter);
+		Contributor contributor = null;
+		contributors.forEach(c -> {
+			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
+				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
+				((author.getOtherName() != null)
+					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
+				matchCounters.set(0, matchCounters.get(0) + 1);
+				c.setSimpleMatch(true);
+			}
+		});
+		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
+		if (matchCounters.get(0) == 1) {
+			updateAuthorsSimpleMatch(contributors, author);
+		} else if (matchCounters.get(0) > 1) {
+			Optional<Contributor> optCon = contributors
+				.stream()
+				.filter(c -> c.isSimpleMatch())
+				.map(c -> {
+					c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
+					logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
+					return c;
+				})
+				.filter(c -> c.getScore() >= threshold)
+				.max(Comparator.comparing(c -> c.getScore()));
+			Contributor bestMatchContributor = null;
+			if (optCon.isPresent()) {
+				bestMatchContributor = optCon.get();
+				bestMatchContributor.setBestMatch(true);
+				logger.info("best match: " + bestMatchContributor.getCreditName());
+				updateAuthorsSimilarityMatch(contributors, author);
+			}
+
+		}
+
+		logger.info("UPDATED contributors: ");
+		contributors.forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence());
+		});
+	}
+
+	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
+		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
+		String[] contributorSplitted = contributor.split(" ");
+		if (contributorSplitted.length == 0) {
+			return 0.0;
+		}
+		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
+		String contributorSurname = "";
+		if (contributorSplitted.length > 1) {
+			StringJoiner joiner = new StringJoiner(" ");
+			for (int i = 0; i < contributorSplitted.length - 1; i++) {
+				joiner.add(contributorSplitted[i]);
+			}
+			contributorSurname = joiner.toString();
+		}
+		logger
+			.debug(
+				"contributorName: " + contributorName +
+					" contributorSurname: " + contributorSurname);
+		String authorNameNrm = normalize(authorName);
+		String authorSurnameNrm = normalize(authorSurname);
+		String contributorNameNrm = normalize(contributorName);
+		String contributorSurnameNrm = normalize(contributorSurname);
+		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
+		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
+		if (sm1.compareTo(sm2) >= 0) {
+			return sm1;
+		}
+		return sm2;
+	}
+
+	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
+		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
+		logger
+			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + "   score: " + Double.toString(score));
+		return score;
+	}
+
+	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
+		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
+	}
+
+	private static String normalize(final String s) {
+		return nfd(s)
+			.toLowerCase()
+			// do not compact the regexes in a single expression, would cause StackOverflowError
+			// in case
+			// of large input strings
+			.replaceAll("(\\W)+", " ")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim();
+	}
+
+	private static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+
+	private static String parse(String name, String surname) {
+		return surname + " " + name;
+	}
+
+	private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
+		contributors.forEach(c -> {
+			if (c.isSimpleMatch()) {
+				logger.info("simple match on : " + c.getCreditName());
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			}
+		});
+		updateRanks(contributors);
+	}
+
+	private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
+		logger.info("inside updateAuthorsSimilarityMatch ...");
+		contributors.forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
+						+ c.isSimpleMatch());
+		});
+
+		contributors
+			.stream()
+			.filter(c -> c.isBestMatch())
+			.forEach(c -> {
+				logger.info("similarity match on : " + c.getCreditName());
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			});
+		updateRanks(contributors);
+	}
+
+	private static void updateRanks(List<Contributor> contributors) {
+		boolean seqFound = false;
+		if (contributors
+			.stream()
+			.filter(
+				c -> c.getRole() != null && c.getSequence() != null &&
+					c.getRole().equals("author") && (c.getSequence().equals("first") ||
+						c.getSequence().equals("additional")))
+			.count() > 0) {
+			seqFound = true;
+			logger.info("sequence data found");
+		}
+		if (!seqFound) {
+			List<Integer> seqIds = Arrays.asList(0);
+			contributors.forEach(c -> {
+				int currentSeq = seqIds.get(0) + 1;
+				seqIds.set(0, currentSeq);
+				c.setSequence(Integer.toString(seqIds.get(0)));
+			});
+		}
+	}
+
+	private static String toJson(WorkDataNoDoi work) {
+		GsonBuilder builder = new GsonBuilder();
+		Gson gson = builder.create();
+		return gson.toJson(work);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
new file mode 100644
index 000000000..f2d51e260
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
@@ -0,0 +1,22 @@
+<configuration>
+    <property>
+        <name>oozie.action.sharelib.for.java</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+      <name>oozie.launcher.mapreduce.map.java.opts</name>
+      <value>-Xmx4g</value>
+    </property>
+    <property>
+        <name>jobTracker</name>
+        <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
new file mode 100644
index 000000000..2486bdb24
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@@ -0,0 +1,524 @@
+<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath_activities</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>shell_cmd_0</name>
+            <value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
+        </property>
+        <property>
+            <name>shell_cmd_1</name>
+            <value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
+        </property>
+        <property>
+            <name>shell_cmd_2</name>
+            <value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
+        </property>
+        <property>
+            <name>shell_cmd_3</name>
+            <value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
+        </property> 
+        <property>
+            <name>shell_cmd_4</name>
+            <value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
+        </property> 
+        <property>
+            <name>shell_cmd_5</name>
+            <value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
+        </property>  
+        <property>
+            <name>shell_cmd_6</name>
+            <value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
+        </property>
+        <property>
+            <name>shell_cmd_7</name>
+            <value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
+        </property>
+        <property>
+            <name>shell_cmd_8</name>
+            <value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
+        </property>
+        <property>
+            <name>shell_cmd_9</name>
+            <value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
+        </property> 
+        <property>
+            <name>shell_cmd_X</name>
+            <value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
+        </property>  
+    </parameters>
+    
+    <start to="ResetWorkingPath"/>
+    
+    
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath_activities}/no_doi_works/*'/>
+            <delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
+        </fs>
+        <ok to="fork_gen_orcid_author_work"/>
+        <error to="Kill"/>
+    </action>
+    
+    <fork name = "fork_gen_orcid_author_work">
+      <path start = "check_exist_on_hdfs_activities_0"/>
+      <path start = "check_exist_on_hdfs_activities_1"/>
+      <path start = "check_exist_on_hdfs_activities_2"/>
+	  <path start = "check_exist_on_hdfs_activities_3"/>
+	  <path start = "check_exist_on_hdfs_activities_4"/>
+	  <path start = "check_exist_on_hdfs_activities_5"/>
+	  <path start = "check_exist_on_hdfs_activities_6"/>
+	  <path start = "check_exist_on_hdfs_activities_7"/>
+	  <path start = "check_exist_on_hdfs_activities_8"/>
+	  <path start = "check_exist_on_hdfs_activities_9"/>
+	  <path start = "check_exist_on_hdfs_activities_X"/>
+   	</fork>
+   	
+    <decision name="check_exist_on_hdfs_activities_0">
+         <switch>
+            <case to="GenOrcidAuthorWork_0">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
+            </case>
+            <default to="Download_0" />
+         </switch>
+	</decision>
+	
+    <action name="Download_0">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_0}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_0"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_0">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_1">
+         <switch>
+            <case to="GenOrcidAuthorWork_1">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
+            </case>
+            <default to="Download_1" />
+         </switch>
+	</decision>
+	
+    <action name="Download_1">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_1}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_1"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_1">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_2">
+         <switch>
+            <case to="GenOrcidAuthorWork_2">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
+            </case>
+            <default to="Download_2" />
+         </switch>
+	</decision>
+	
+    <action name="Download_2">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_2}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_2"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_2">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_3">
+         <switch>
+            <case to="GenOrcidAuthorWork_3">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
+            </case>
+            <default to="Download_3" />
+         </switch>
+	</decision>
+	
+    <action name="Download_3">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_3}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_3"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_3">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_4">
+         <switch>
+            <case to="GenOrcidAuthorWork_4">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
+            </case>
+            <default to="Download_4" />
+         </switch>
+	</decision>
+	
+    <action name="Download_4">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_4}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_4"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_4">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_5">
+         <switch>
+            <case to="GenOrcidAuthorWork_5">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
+            </case>
+            <default to="Download_5" />
+         </switch>
+	</decision>
+	
+    <action name="Download_5">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_5}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_5"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_5">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_6">
+         <switch>
+            <case to="GenOrcidAuthorWork_6">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
+            </case>
+            <default to="Download_6" />
+         </switch>
+	</decision>
+	
+    <action name="Download_6">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_6}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_6"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_6">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    
+    <decision name="check_exist_on_hdfs_activities_7">
+         <switch>
+            <case to="GenOrcidAuthorWork_7">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
+            </case>
+            <default to="Download_7" />
+         </switch>
+	</decision>
+	
+    <action name="Download_7">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_7}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_7"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_7">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_8">
+         <switch>
+            <case to="GenOrcidAuthorWork_8">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
+            </case>
+            <default to="Download_8" />
+         </switch>
+	</decision>
+	
+    <action name="Download_8">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_8}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_8"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_8">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_9">
+         <switch>
+            <case to="GenOrcidAuthorWork_9">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
+            </case>
+            <default to="Download_9" />
+         </switch>
+	</decision>
+	
+    <action name="Download_9">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_9}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_9"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_9">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_X">
+         <switch>
+            <case to="GenOrcidAuthorWork_X">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
+            </case>
+            <default to="Download_X" />
+         </switch>
+	</decision>
+	
+    <action name="Download_X">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_X}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_X"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_X">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
+
+    <action name="Gen_Enriched_Orcid_Works">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Gen_Enriched_Orcid_Works</name>
+            <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
+            <jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
+            <spark-opts>--num-executors 10 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
+            </spark-opts>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-ow</arg><arg>no_doi_works/</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    
+   <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json
new file mode 100644
index 000000000..c3a8f92ec
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json
@@ -0,0 +1,7 @@
+[
+ {"paramName":"n",   "paramLongName":"hdfsServerUri",	"paramDescription": "the server uri",   "paramRequired": true},
+ {"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the default work path",	"paramRequired": true},
+ {"paramName":"f",   "paramLongName":"activitiesFileNameTarGz",	"paramDescription": "the name of the activities orcid file",	"paramRequired": true},
+ {"paramName":"ow",   "paramLongName":"outputWorksPath",	"paramDescription": "the relative folder of the sequencial file to write",	"paramRequired": true},
+ {"paramName":"oew",   "paramLongName":"outputEnrichedWorksPath",	"paramDescription": "the relative folder of the sequencial file to write the data",	"paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index 31f8432ac..6a5faddbd 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -1,15 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi.xml;
 
-import com.ximpleware.NavException;
-import com.ximpleware.ParseException;
-import com.ximpleware.XPathEvalException;
-import com.ximpleware.XPathParseException;
-import eu.dnetlib.dhp.parser.utility.VtdException;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import jdk.nashorn.internal.ir.annotations.Ignore;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.IOException;
+import java.text.Normalizer;
+import java.util.*;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.text.similarity.JaccardSimilarity;
 import org.apache.commons.text.similarity.JaroWinklerSimilarity;
@@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.text.Normalizer;
-import java.util.*;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.ximpleware.NavException;
+import com.ximpleware.ParseException;
+import com.ximpleware.XPathEvalException;
+import com.ximpleware.XPathParseException;
 
-import static org.junit.jupiter.api.Assertions.assertNotNull;
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 
 public class OrcidNoDoiTest {
 
@@ -33,100 +39,10 @@ public class OrcidNoDoiTest {
 	String nameB = "K";
 	String surnameB = "Abdel-Dayem";
 	String orcidIdA = "0000-0003-2760-1191";
-	Double threshold = 0.8;
 
 	@Test
 	@Ignore
-	private void similarityTest() throws Exception {
-		logger.info("running testSimilarity ....");
-		logger
-			.info(
-				"JaroWinklerSimilarity: "
-					+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
-		logger
-			.info(
-				"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
-	}
-
-	@Test
-	@Ignore
-	private void bestMatchTest() throws Exception {
-		logger.info("running bestMatchTest ....");
-		String contributor = surnameB + ", " + nameB;
-		logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
-	}
-
-	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
-		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
-		String[] contributorSplitted = contributor.split(" ");
-		if (contributorSplitted.length == 0) {
-			return 0.0;
-		}
-		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
-		String contributorSurname = "";
-		if (contributorSplitted.length > 1) {
-			StringJoiner joiner = new StringJoiner(" ");
-			for (int i = 0; i < contributorSplitted.length - 1; i++) {
-				joiner.add(contributorSplitted[i]);
-			}
-			contributorSurname = joiner.toString();
-		}
-		logger
-			.debug(
-				"contributorName: " + contributorName +
-					" contributorSurname: " + contributorSurname);
-		String authorNameNrm = normalize(authorName);
-		String authorSurnameNrm = normalize(authorSurname);
-		String contributorNameNrm = normalize(contributorName);
-		String contributorSurnameNrm = normalize(contributorSurname);
-		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
-		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
-		if (sm1.compareTo(sm2) >= 0) {
-			return sm1;
-		}
-		return sm2;
-	}
-
-	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
-		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
-		logger
-			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + "   score: " + Double.toString(score));
-		return score;
-	}
-
-	private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
-		return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
-	}
-
-	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
-		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
-	}
-
-	private static String parse(String name, String surname) {
-		return surname + " " + name;
-	}
-
-	private static String normalize(final String s) {
-		return nfd(s)
-			.toLowerCase()
-			// do not compact the regexes in a single expression, would cause StackOverflowError
-			// in case
-			// of large input strings
-			.replaceAll("(\\W)+", " ")
-			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
-			.replaceAll("(\\p{Punct})+", " ")
-			.replaceAll("(\\d)+", " ")
-			.replaceAll("(\\n)+", " ")
-			.trim();
-	}
-
-	private static String nfd(final String s) {
-		return Normalizer.normalize(s, Normalizer.Form.NFD);
-	}
-
-	@Test
-	@Ignore
-	public void readPublicationFieldsTest()
+	private void readPublicationFieldsTest()
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
 		logger.info("running loadPublicationFieldsTest ....");
 		String xml = IOUtils
@@ -178,78 +94,10 @@ public class OrcidNoDoiTest {
 
 	}
 
-	private void updateRanks(List<Contributor> contributors) {
-		boolean seqFound = false;
-		if (contributors
-			.stream()
-			.filter(
-				c -> c.getRole() != null && c.getSequence() != null &&
-					c.getRole().equals("author") && (c.getSequence().equals("first") ||
-						c.getSequence().equals("additional")))
-			.count() > 0) {
-			seqFound = true;
-			logger.info("sequence data found");
-		}
-		if (!seqFound) {
-			List<Integer> seqIds = Arrays.asList(0);
-			contributors.forEach(c -> {
-				int currentSeq = seqIds.get(0) + 1;
-				seqIds.set(0, currentSeq);
-				c.setSequence(Integer.toString(seqIds.get(0)));
-			});
-		}
-	}
-
-	private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
-		contributors.forEach(c -> {
-			if (c.isSimpleMatch()) {
-				logger.info("simple match on : " + c.getCreditName());
-				c.setName(author.getName());
-				c.setSurname(author.getSurname());
-				c.setOid(author.getOid());
-			}
-		});
-		updateRanks(contributors);
-	}
-
-	private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
-		logger.info("inside updateAuthorsSimilarityMatch ...");
-		contributors.forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
-						+ c.isSimpleMatch());
-		});
-
-		contributors
-			.stream()
-			.filter(c -> c.isBestMatch())
-			.forEach(c -> {
-				logger.info("similarity match on : " + c.getCreditName());
-				c.setName(author.getName());
-				c.setSurname(author.getSurname());
-				c.setOid(author.getOid());
-			});
-		updateRanks(contributors);
-	}
-
 	@Test
-	@Ignore
-	public void authorSimilarityMatchTest() throws Exception {
-		logger.info("running authorSimilarityMatchTest ....");
-		authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
-	}
-
-	@Test
-	private void authorSimpleMatchTest() throws Exception {
+	public void authorMatchTest() throws Exception {
 		logger.info("running authorSimpleMatchTest ....");
-		authorMatchTest("activity_work_0000-0003-2760-1191.xml");
-	}
-
-	private void authorMatchTest(String orcidWork)
-		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
 		AuthorData author = new AuthorData();
 		author.setName(nameA);
 		author.setSurname(surnameA);
@@ -272,55 +120,9 @@ public class OrcidNoDoiTest {
 			logger.error("parsing xml", e);
 		}
 		assertNotNull(workData);
-		int matchCounter = 0;
-		List<Integer> matchCounters = Arrays.asList(matchCounter);
-		Contributor contributor = null;
-		workData.getContributors().forEach(c -> {
-			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
-				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
-				((author.getOtherName() != null)
-					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
-				matchCounters.set(0, matchCounters.get(0) + 1);
-				c.setSimpleMatch(true);
-			}
-		});
-		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
-		if (matchCounters.get(0) == 1) {
-			updateAuthorsSimpleMatch(workData.getContributors(), author);
-		} else if (matchCounters.get(0) > 1) {
-			Optional<Contributor> optCon = workData
-				.getContributors()
-				.stream()
-				.filter(c -> c.isSimpleMatch())
-				.map(c -> {
-					c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
-					logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
-					return c;
-				})
-				.filter(c -> c.getScore() >= threshold)
-				.max(Comparator.comparing(c -> c.getScore()));
-			Contributor bestMatchContributor = null;
-			if (optCon.isPresent()) {
-				bestMatchContributor = optCon.get();
-				bestMatchContributor.setBestMatch(true);
-				logger.info("best match: " + bestMatchContributor.getCreditName());
-				updateAuthorsSimilarityMatch(workData.getContributors(), author);
-			}
-
-		}
-
-		logger.info("UPDATED contributors: ");
-		workData.getContributors().forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence());
-		});
+		AuthorMatcher.match(author, workData.getContributors());
+		GsonBuilder builder = new GsonBuilder();
+		Gson gson = builder.create();
+		logger.info(gson.toJson(workData));
 	}
 }
-
-//
-//		orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
-//		no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
-//
\ No newline at end of file

From b2213b6435dd3180adff6a7546e9f03337e8056c Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Fri, 26 Jun 2020 17:27:34 +0200
Subject: [PATCH 03/34] merged with dnet version

---
 .../orcid/ActivitiesDecompressor.java         |   2 +-
 .../doiboost/orcid/SummariesDecompressor.java |   2 +-
 .../doiboost/orcid/json/JsonHelper.java       |   1 +
 .../orcidnodoi/ActivitiesDumpReader.java      |  30 +-
 .../orcidnodoi/GenOrcidAuthorWork.java        |   7 +-
 .../SparkGenEnrichedOrcidWorks.java           |  59 +--
 .../orcidnodoi/proto/ProtoWriter.java         | 427 ++++++++++++++++++
 .../oozie_app/workflow.xml                    |   2 +-
 8 files changed, 483 insertions(+), 47 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
index 80ccd71a1..02d2b267b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
@@ -19,9 +19,9 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 
 public class ActivitiesDecompressor {
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index 603bfedf6..29d72ed0b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -19,9 +19,9 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 
 public class SummariesDecompressor {
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
index 13a3cee8f..bfd6f7447 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
@@ -3,6 +3,7 @@ package eu.dnetlib.doiboost.orcid.json;
 
 import com.google.gson.Gson;
 import com.google.gson.JsonObject;
+
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 
 public class JsonHelper {
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index 7eb6faf54..506641b81 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -1,10 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import eu.dnetlib.doiboost.orcid.json.JsonHelper;
-import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.hadoop.conf.Configuration;
@@ -17,11 +19,10 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URI;
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
 
 public class ActivitiesDumpReader {
 
@@ -82,7 +83,8 @@ public class ActivitiesDumpReader {
 							while ((line = br.readLine()) != null) {
 								buffer.append(line);
 							}
-							WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
+							WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi
+								.VTDParseWorkData(buffer.toString().getBytes());
 							if (workDataNoDoi != null) {
 								if (workDataNoDoi.getErrorCode() != null) {
 									errorFromOrcidFound += 1;
@@ -94,9 +96,11 @@ public class ActivitiesDumpReader {
 												+ entry.getName());
 									continue;
 								}
-								boolean isDoiFound = workDataNoDoi.getExtIds().stream()
-										.filter(e -> e.getType()!=null)
-										.anyMatch(e -> e.getType().equals("doi"));
+								boolean isDoiFound = workDataNoDoi
+									.getExtIds()
+									.stream()
+									.filter(e -> e.getType() != null)
+									.anyMatch(e -> e.getType().equals("doi"));
 								if (!isDoiFound) {
 									String jsonData = JsonHelper.createOidWork(workDataNoDoi);
 									Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
index b82f4bc4c..bbaa5acca 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -1,15 +1,16 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.OrcidDSManager;
+import java.io.IOException;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.mortbay.log.Log;
 
-import java.io.IOException;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.OrcidDSManager;
 
 public class GenOrcidAuthorWork extends OrcidDSManager {
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 6bb31bcf6..9d9c5bc4a 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -1,13 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import com.google.gson.Gson;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonParser;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
@@ -19,14 +18,17 @@ import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
 import scala.Tuple2;
 
-import java.io.IOException;
-import java.util.Objects;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
 public class SparkGenEnrichedOrcidWorks {
 
 	public static void main(String[] args) throws IOException, Exception {
@@ -67,27 +69,28 @@ public class SparkGenEnrichedOrcidWorks {
 						Encoders.bean(AuthorData.class));
 
 				JavaPairRDD<Text, Text> activitiesRDD = sc
-					.sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class);
+					.sequenceFile(workingPath + outputWorksPath + "works_X.seq", Text.class, Text.class);
 				Dataset<WorkDataNoDoi> activitiesDataset = spark
 					.createDataset(
 						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
 						Encoders.bean(WorkDataNoDoi.class));
 
 				activitiesDataset
-						.joinWith(
-								summariesDataset,
-								activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
-						.map(
-								(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
-									WorkDataNoDoi w = value._1;
-									AuthorData a = value._2;
-									AuthorMatcher.match(a, w.getContributors());
-									return new Tuple2<>(a.getOid(), w);
-								},
-								Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
-						.filter(Objects::nonNull)
-						.toJavaRDD()
-						.saveAsTextFile(workingPath + outputEnrichedWorksPath);;
+					.joinWith(
+						summariesDataset,
+						activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
+					.map(
+						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
+							WorkDataNoDoi w = value._1;
+							AuthorData a = value._2;
+							AuthorMatcher.match(a, w.getContributors());
+							return new Tuple2<>(a.getOid(), w);
+						},
+						Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
+					.filter(Objects::nonNull)
+					.toJavaRDD()
+					.saveAsTextFile(workingPath + outputEnrichedWorksPath);
+				;
 			});
 	}
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java
new file mode 100644
index 000000000..01b172359
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java
@@ -0,0 +1,427 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.proto;
+
+public class ProtoWriter {
+
+}
+//
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
+//
+//import java.io.IOException;
+//import java.io.InputStream;
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//import org.apache.commons.io.IOUtils;
+//import org.apache.commons.lang3.StringUtils;
+//
+//import com.google.gson.Gson;
+//import com.google.gson.JsonArray;
+//import com.google.gson.JsonElement;
+//import com.google.gson.JsonObject;
+//import com.googlecode.protobuf.format.JsonFormat;
+//
+//import eu.dnetlib.actionmanager.actions.ActionFactory;
+//import eu.dnetlib.actionmanager.actions.AtomicAction;
+//import eu.dnetlib.actionmanager.common.Agent;
+//import eu.dnetlib.data.mapreduce.hbase.Reporter;
+//import eu.dnetlib.data.mapreduce.util.StreamUtils;
+//import eu.dnetlib.data.proto.FieldTypeProtos;
+//import eu.dnetlib.data.proto.FieldTypeProtos.Author;
+//import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
+//import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
+//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
+//import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
+//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
+//import eu.dnetlib.data.proto.KindProtos;
+//import eu.dnetlib.data.proto.OafProtos;
+//import eu.dnetlib.data.proto.ResultProtos;
+//import eu.dnetlib.data.proto.TypeProtos;
+//import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
+//import eu.dnetlib.miscutils.collections.Pair;
+//import eu.dnetlib.miscutils.datetime.DateUtils;
+//import eu.dnetlib.pace.model.Person;
+//
+//public class ProtoWriter {
+//
+//    public static final String ORCID = "ORCID";
+//    public final static String orcidPREFIX = "orcid_______";
+//    public static final String OPENAIRE_PREFIX = "openaire____";
+//    public static final String SEPARATOR = "::";
+//
+//    private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
+//
+//        {
+//            put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
+//
+//        }
+//    };
+//
+//    // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
+//    private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
+//
+//        {
+//            put("ark".toLowerCase(), new Pair<>("ark", "ark"));
+//            put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
+//            put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
+//            put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
+//            put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
+//            put("urn".toLowerCase(), new Pair<>("urn", "urn"));
+//        }
+//    };
+//
+//    static Map<String, Map<String, String>> typologiesMapping;
+//
+//    static {
+//        try {
+//            final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
+//            final String tt = IOUtils.toString(is);
+//            typologiesMapping = new Gson().fromJson(tt, Map.class);
+//        } catch (final IOException e) {
+//            e.printStackTrace();
+//        }
+//    }
+//
+//    public static final String PID_TYPES = "dnet:pid_types";
+//
+//    public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
+//                                                                        final ActionFactory factory,
+//                                                                        final String setName,
+//                                                                        final Agent agent,
+//                                                                        final Reporter context) {
+//
+//        if (!isValid(rootElement, context)) { return null; }
+//
+//        // Create OAF proto
+//
+//        final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
+//
+//        oaf.setDataInfo(
+//                DataInfo.newBuilder()
+//                        .setDeletedbyinference(false)
+//                        .setInferred(false)
+//                        .setTrust("0.9")
+//                        .setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
+//                        .build());
+//
+//        // Adding kind
+//        oaf.setKind(KindProtos.Kind.entity);
+//
+//        oaf.setLastupdatetimestamp(DateUtils.now());
+//
+//        // creating result proto
+//        final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
+//
+//        entity.setDateofcollection("2018-10-22");
+//        entity.setDateoftransformation(DateUtils.now_ISO8601());
+//
+//        // Adding external ids
+//        StreamUtils.toStream(externalIds.keySet().iterator())
+//                .forEach(jsonExtId -> {
+//                    final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
+//                    final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
+//                    final String extId = getStringValue(rootElement, jsonExtId);
+//                    if (StringUtils.isNotBlank(extId)) {
+//                        entity.addPid(StructuredProperty.newBuilder()
+//                                .setValue(extId)
+//                                .setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
+//                                        .setSchemename("dnet:pid_types").build())
+//                                .build());
+//                    }
+//                });
+//
+//        // Create result field
+//        final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
+//
+//        // Create metadata proto
+//        final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
+//
+//        // Adding source
+//        final String source = getStringValue(rootElement, "source");
+//        if (StringUtils.isNotBlank(source)) {
+//            metadata.addSource(StringField.newBuilder().setValue(source).build());
+//        }
+//
+//        // Adding title
+//        final String title = createRepeatedField(rootElement, "titles");
+//        if (StringUtils.isBlank(title)) {
+//            context.incrementCounter("filtered", "title_not_found", 1);
+//            return null;
+//        }
+//        metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
+//                .setValue(title)
+//                .setQualifier(getQualifier("main title", "dnet:dataCite_title"))
+//                .build());
+//
+//        // Adding identifier
+//        final String id = getStringValue(rootElement, "id");
+//        String sourceId = null;
+//        if (id != null) {
+//            entity.addOriginalId(id);
+//            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
+//        } else {
+//            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
+//        }
+//        entity.setId(sourceId);
+//
+//        // Adding relevant date
+//        settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
+//
+//        // Adding collectedfrom
+//        final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
+//                .setValue(ORCID)
+//                .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
+//                .build();
+//        entity.addCollectedfrom(collectedFrom);
+//
+//        // Adding type
+//        final String type = getStringValue(rootElement, "type");
+//        String cobjValue = "";
+//        if (StringUtils.isNotBlank(type)) {
+//
+//            metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
+//                    .setClassid(type)
+//                    .setClassname(type)
+//                    .setSchemeid("dnet:dataCite_resource")
+//                    .setSchemename("dnet:dataCite_resource")
+//                    .build());
+//
+//            final String typeValue = typologiesMapping.get(type).get("value");
+//            cobjValue = typologiesMapping.get(type).get("cobj");
+//            final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
+//
+//            // Adding hostedby
+//            instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
+//                    .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
+//                    .setValue("Unknown Repository")
+//                    .build());
+//
+//            // Adding url
+//            final String url = createRepeatedField(rootElement, "urls");
+//            if (StringUtils.isNotBlank(url)) {
+//                instance.addUrl(url);
+//            }
+//
+//            final String pubDate = getPublicationDate(rootElement, "publication_date");
+//            if (StringUtils.isNotBlank(pubDate)) {
+//                instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
+//            }
+//
+//            instance.setCollectedfrom(collectedFrom);
+//
+//            // Adding accessright
+//            instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
+//                    .setClassid("UNKNOWN")
+//                    .setClassname("UNKNOWN")
+//                    .setSchemeid("dnet:access_modes")
+//                    .setSchemename("dnet:access_modes")
+//                    .build());
+//
+//            // Adding type
+//            instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
+//                    .setClassid(cobjValue)
+//                    .setClassname(typeValue)
+//                    .setSchemeid("dnet:publication_resource")
+//                    .setSchemename("dnet:publication_resource")
+//                    .build());
+//
+//            result.addInstance(instance);
+//        } else {
+//            context.incrementCounter("filtered", "type_not_found", 1);
+//            return null;
+//        }
+//
+//        // Adding authors
+//        final List<Author> authors = createAuthors(rootElement);
+//        if (authors != null && authors.size() > 0) {
+//            metadata.addAllAuthor(authors);
+//        } else {
+//            context.incrementCounter("filtered", "author_not_found", 1);
+//            return null;
+//        }
+//
+//        metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
+//        result.setMetadata(metadata.build());
+//        entity.setResult(result.build());
+//        oaf.setEntity(entity.build());
+//
+//        final List<AtomicAction> actionList = new ArrayList<>();
+//
+//        actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
+//
+////		 System.out.println(JsonFormat.printToString(oaf.build()));
+//        return actionList;
+//
+//    }
+//
+//    public static List<Author> createAuthors(final JsonObject root) {
+//
+//        final String authorsJSONFieldName = "authors";
+//
+//        if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
+//
+//            final List<Author> authors = new ArrayList<>();
+//            final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
+//            int firstCounter = 0;
+//            int defaultCounter = 0;
+//            int rank = 1;
+//            int currentRank = 0;
+//
+//            for (final JsonElement item : jsonAuthors) {
+//                final JsonObject author = item.getAsJsonObject();
+//                final Author.Builder result = Author.newBuilder();
+//                if (item.isJsonObject()) {
+//                    final String surname = getStringValue(author, "surname");
+//                    final String name = getStringValue(author, "name");
+//                    final String oid = getStringValue(author, "oid");
+//                    final String seq = getStringValue(author, "seq");
+//                    if (StringUtils.isNotBlank(seq)) {
+//                        if (seq.equals("first")) {
+//                            firstCounter += 1;
+//                            rank = firstCounter;
+//
+//                        } else if (seq.equals("additional")) {
+//                            rank = currentRank + 1;
+//                        } else {
+//                            defaultCounter += 1;
+//                            rank = defaultCounter;
+//                        }
+//                    }
+//
+//                    if (StringUtils.isNotBlank(oid)) {
+//                        result.addPid(KeyValue.newBuilder()
+//                                .setValue(oid)
+//                                .setKey("ORCID")
+//                                .build());
+//                        result.setFullname(name + " " + surname);
+//                        if (StringUtils.isNotBlank(name)) {
+//                            result.setName(name);
+//                        }
+//                        if (StringUtils.isNotBlank(surname)) {
+//                            result.setSurname(surname);
+//                        }
+//                    } else {
+//                        String fullname = "";
+//                        if (StringUtils.isNotBlank(name)) {
+//                            fullname = name;
+//                        } else {
+//                            if (StringUtils.isNotBlank(surname)) {
+//                                fullname = surname;
+//                            }
+//                        }
+//                        Person p = new Person(fullname, false);
+//                        if (p.isAccurate()) {
+//                            result.setName(p.getNormalisedFirstName());
+//                            result.setSurname(p.getNormalisedSurname());
+//                            result.setFullname(p.getNormalisedFullname());
+//                        }
+//                        else {
+//                            result.setFullname(fullname);
+//                        }
+//                    }
+//                }
+//                result.setRank(rank);
+//                authors.add(result.build());
+//                currentRank = rank;
+//            }
+//            return authors;
+//
+//        }
+//        return null;
+//    }
+//
+//    private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
+//        String field = "";
+//        if (!rootElement.has(fieldName)) { return null; }
+//        if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
+//        if (rootElement.get(fieldName).isJsonArray()) {
+//            if (!isValidJsonArray(rootElement, fieldName)) { return null; }
+//            final StringBuilder ttl = new StringBuilder();
+//            getArrayValues(rootElement, fieldName).forEach(ttl::append);
+//            field = ttl.toString();
+//        } else {
+//            field = getStringValue(rootElement, fieldName);
+//        }
+//
+//        if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
+//            field = field.substring(1, field.length() - 1);
+//        }
+//        return field;
+//    }
+//
+//    private static void settingRelevantDate(final JsonObject rootElement,
+//                                            final ResultProtos.Result.Metadata.Builder metadata,
+//                                            final String jsonKey,
+//                                            final String dictionaryKey,
+//                                            final boolean addToDateOfAcceptance) {
+//
+//        final String pubDate = getPublicationDate(rootElement, "publication_date");
+//        if (StringUtils.isNotBlank(pubDate)) {
+//            if (addToDateOfAcceptance) {
+//                metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
+//            }
+//            metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
+//                    .setValue(pubDate)
+//                    .setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
+//                    .build());
+//        }
+//    }
+//
+//    private static String getPublicationDate(final JsonObject rootElement,
+//                                             final String jsonKey) {
+//
+//        final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
+//        if (pubDateJson == null) { return null; }
+//        final String year = getStringValue(pubDateJson, "year");
+//        final String month = getStringValue(pubDateJson, "month");
+//        final String day = getStringValue(pubDateJson, "day");
+//
+//        if (StringUtils.isBlank(year)) { return null; }
+//        String pubDate = "".concat(year);
+//        if (StringUtils.isNotBlank(month)) {
+//            pubDate = pubDate.concat("-" + month);
+//            if (StringUtils.isNotBlank(day)) {
+//                pubDate = pubDate.concat("-" + day);
+//            } else {
+//                pubDate += "-01";
+//            }
+//        } else {
+//            pubDate += "-01-01";
+//        }
+//        if (isValidDate(pubDate)) { return pubDate; }
+//        return null;
+//    }
+//
+//    protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
+//
+//        final String type = getStringValue(rootElement, "type");
+//        if (!typologiesMapping.containsKey(type)) {
+//            context.incrementCounter("filtered", "unknowntype_" + type, 1);
+//            return false;
+//        }
+//
+//        if (!isValidJsonArray(rootElement, "titles")) {
+//            context.incrementCounter("filtered", "invalid_title", 1);
+//            return false;
+//        }
+//        return true;
+//    }
+//
+//    private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
+//        if (!rootElement.has(fieldName)) { return false; }
+//        final JsonElement jsonElement = rootElement.get(fieldName);
+//        if (jsonElement.isJsonNull()) { return false; }
+//        if (jsonElement.isJsonArray()) {
+//            final JsonArray jsonArray = jsonElement.getAsJsonArray();
+//            if (jsonArray.isJsonNull()) { return false; }
+//            if (jsonArray.get(0).isJsonNull()) { return false; }
+//        }
+//        return true;
+//    }
+//}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
index 2486bdb24..33fbdf875 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@@ -509,7 +509,7 @@
             <mode>cluster</mode>
             <name>Gen_Enriched_Orcid_Works</name>
             <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
-            <jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
+            <jar>dhp-doiboost-1.2.2-SNAPSHOT.jar</jar>
             <spark-opts>--num-executors 10 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
             </spark-opts>
             <arg>-w</arg><arg>${workingPath}/</arg>

From b7b6be12a51c81b2b7469684cf18bc8a3014aec4 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Mon, 29 Jun 2020 18:03:16 +0200
Subject: [PATCH 04/34] fixed enriched works generation

---
 .../doiboost/orcid/json/JsonHelper.java       |  6 +--
 .../orcidnodoi/ActivitiesDumpReader.java      |  4 +-
 .../orcidnodoi/GenOrcidAuthorWork.java        |  1 +
 .../SparkGenEnrichedOrcidWorks.java           | 29 +++++------
 .../orcidnodoi/similarity/AuthorMatcher.java  | 48 +++++--------------
 .../orcidnodoi/xml/XMLRecordParserNoDoi.java  |  4 +-
 .../oozie_app/config-default.xml              | 17 +++++--
 .../oozie_app/workflow.xml                    | 24 +++++++---
 8 files changed, 66 insertions(+), 67 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
index bfd6f7447..94f7d8c91 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
@@ -2,16 +2,12 @@
 package eu.dnetlib.doiboost.orcid.json;
 
 import com.google.gson.Gson;
-import com.google.gson.JsonObject;
 
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 
 public class JsonHelper {
 
 	public static String createOidWork(WorkDataNoDoi workData) {
-		JsonObject oidWork = new JsonObject();
-		oidWork.addProperty("oid", workData.getOid());
-		oidWork.addProperty("work", new Gson().toJson(workData));
-		return oidWork.toString();
+		return new Gson().toJson(workData);
 	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index 506641b81..bf63568d8 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
 
 public class ActivitiesDumpReader {
 
-	private static final int MAX_XML_WORKS_PARSED = -1;
-	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
+	private static final int MAX_XML_WORKS_PARSED = 100;
+	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10;
 
 	public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
 		throws Exception {
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
index bbaa5acca..8dcee796c 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -45,6 +45,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
 		Log.info("HDFS URI: " + hdfsServerUri);
 		workingPath = parser.get("workingPath");
 		Log.info("Working Path: " + workingPath);
+		hdfsOrcidDefaultPath = workingPath;
 		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
 		Log.info("Activities File Name: " + activitiesFileNameTarGz);
 		outputWorksPath = parser.get("outputWorksPath");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 9d9c5bc4a..ae1e4dae6 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -24,6 +24,7 @@ import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
@@ -31,9 +32,9 @@ import scala.Tuple2;
 
 public class SparkGenEnrichedOrcidWorks {
 
+	static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
+
 	public static void main(String[] args) throws IOException, Exception {
-		Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
-		logger.info("[ SparkGenerateDoiAuthorList STARTED]");
 
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
@@ -46,13 +47,9 @@ public class SparkGenEnrichedOrcidWorks {
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
-		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String workingPath = parser.get("workingPath");
-		logger.info("workingPath: ", workingPath);
 		final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
-		logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath);
 		final String outputWorksPath = parser.get("outputWorksPath");
-		logger.info("outputWorksPath: ", outputWorksPath);
 
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
@@ -67,30 +64,33 @@ public class SparkGenEnrichedOrcidWorks {
 					.createDataset(
 						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
 						Encoders.bean(AuthorData.class));
+				logger.info("Authors data loaded: " + summariesDataset.count());
 
 				JavaPairRDD<Text, Text> activitiesRDD = sc
-					.sequenceFile(workingPath + outputWorksPath + "works_X.seq", Text.class, Text.class);
+					.sequenceFile(workingPath + outputWorksPath + "*.seq", Text.class, Text.class);
 				Dataset<WorkDataNoDoi> activitiesDataset = spark
 					.createDataset(
 						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
 						Encoders.bean(WorkDataNoDoi.class));
+				logger.info("Works data loaded: " + activitiesDataset.count());
 
-				activitiesDataset
+				JavaRDD<Tuple2<String, String>> enrichedWorksRDD = activitiesDataset
 					.joinWith(
 						summariesDataset,
 						activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
 					.map(
-						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
+						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, String>>) value -> {
 							WorkDataNoDoi w = value._1;
 							AuthorData a = value._2;
 							AuthorMatcher.match(a, w.getContributors());
-							return new Tuple2<>(a.getOid(), w);
+							return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
 						},
-						Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
+						Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
 					.filter(Objects::nonNull)
-					.toJavaRDD()
-					.saveAsTextFile(workingPath + outputEnrichedWorksPath);
-				;
+					.toJavaRDD();
+				logger.info("Works enriched data created: " + enrichedWorksRDD.count());
+				enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath);
+				logger.info("Works enriched data saved");
 			});
 	}
 
@@ -105,6 +105,7 @@ public class SparkGenEnrichedOrcidWorks {
 	}
 
 	private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
+
 		WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
 		return workData;
 	}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
index 09fd8b36b..1e4c38bef 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -33,15 +33,13 @@ public class AuthorMatcher {
 		List<Integer> matchCounters = Arrays.asList(matchCounter);
 		Contributor contributor = null;
 		contributors.forEach(c -> {
-			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
-				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
-				((author.getOtherName() != null)
-					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
+			if (simpleMatch(c.getCreditName(), author.getName()) ||
+				simpleMatch(c.getCreditName(), author.getSurname()) ||
+				simpleMatch(c.getCreditName(), author.getOtherName())) {
 				matchCounters.set(0, matchCounters.get(0) + 1);
 				c.setSimpleMatch(true);
 			}
 		});
-		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
 		if (matchCounters.get(0) == 1) {
 			updateAuthorsSimpleMatch(contributors, author);
 		} else if (matchCounters.get(0) > 1) {
@@ -50,7 +48,6 @@ public class AuthorMatcher {
 				.filter(c -> c.isSimpleMatch())
 				.map(c -> {
 					c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
-					logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
 					return c;
 				})
 				.filter(c -> c.getScore() >= threshold)
@@ -59,24 +56,21 @@ public class AuthorMatcher {
 			if (optCon.isPresent()) {
 				bestMatchContributor = optCon.get();
 				bestMatchContributor.setBestMatch(true);
-				logger.info("best match: " + bestMatchContributor.getCreditName());
 				updateAuthorsSimilarityMatch(contributors, author);
 			}
 
 		}
 
-		logger.info("UPDATED contributors: ");
-		contributors.forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence());
-		});
+	}
+
+	private static boolean simpleMatch(String name, String searchValue) {
+		if (searchValue == null) {
+			return false;
+		}
+		return normalize(name).contains(normalize(searchValue));
 	}
 
 	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
-		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
 		String[] contributorSplitted = contributor.split(" ");
 		if (contributorSplitted.length == 0) {
 			return 0.0;
@@ -90,10 +84,6 @@ public class AuthorMatcher {
 			}
 			contributorSurname = joiner.toString();
 		}
-		logger
-			.debug(
-				"contributorName: " + contributorName +
-					" contributorSurname: " + contributorSurname);
 		String authorNameNrm = normalize(authorName);
 		String authorSurnameNrm = normalize(authorSurname);
 		String contributorNameNrm = normalize(contributorName);
@@ -108,8 +98,6 @@ public class AuthorMatcher {
 
 	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
 		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
-		logger
-			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + "   score: " + Double.toString(score));
 		return score;
 	}
 
@@ -118,6 +106,9 @@ public class AuthorMatcher {
 	}
 
 	private static String normalize(final String s) {
+		if (s == null) {
+			return new String("");
+		}
 		return nfd(s)
 			.toLowerCase()
 			// do not compact the regexes in a single expression, would cause StackOverflowError
@@ -142,7 +133,6 @@ public class AuthorMatcher {
 	private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
 		contributors.forEach(c -> {
 			if (c.isSimpleMatch()) {
-				logger.info("simple match on : " + c.getCreditName());
 				c.setName(author.getName());
 				c.setSurname(author.getSurname());
 				c.setOid(author.getOid());
@@ -152,21 +142,10 @@ public class AuthorMatcher {
 	}
 
 	private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
-		logger.info("inside updateAuthorsSimilarityMatch ...");
-		contributors.forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
-						+ c.isSimpleMatch());
-		});
-
 		contributors
 			.stream()
 			.filter(c -> c.isBestMatch())
 			.forEach(c -> {
-				logger.info("similarity match on : " + c.getCreditName());
 				c.setName(author.getName());
 				c.setSurname(author.getSurname());
 				c.setOid(author.getOid());
@@ -184,7 +163,6 @@ public class AuthorMatcher {
 						c.getSequence().equals("additional")))
 			.count() > 0) {
 			seqFound = true;
-			logger.info("sequence data found");
 		}
 		if (!seqFound) {
 			List<Integer> seqIds = Arrays.asList(0);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
index 6e5771547..ae96a322f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
@@ -41,7 +41,6 @@ public class XMLRecordParserNoDoi {
 	public static WorkDataNoDoi VTDParseWorkData(byte[] bytes)
 		throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
 		NavException, XPathEvalException {
-		logger.info("parsing xml ...");
 		final VTDGen vg = new VTDGen();
 		vg.setDoc(bytes);
 		vg.parse(true);
@@ -191,6 +190,9 @@ public class XMLRecordParserNoDoi {
 				nameIndex++;
 			}
 		}
+		if (contributors.size() == 0) {
+			return contributors;
+		}
 
 		int sequenceIndex = 0;
 		ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence");
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
index f2d51e260..3068562d0 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
@@ -8,15 +8,24 @@
         <value>true</value>
     </property>
     <property>
-      <name>oozie.launcher.mapreduce.map.java.opts</name>
-      <value>-Xmx4g</value>
+        <name>oozie.launcher.mapreduce.map.java.opts</name>
+        <value>-Xmx4g</value>
     </property>
     <property>
         <name>jobTracker</name>
-        <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
+        <value>yarnRM</value>
     </property>
     <property>
         <name>nameNode</name>
-        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
+        <value>hdfs://nameservice1</value>
+    </property>
+
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
     </property>
 </configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
index 33fbdf875..df5e0e76f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@@ -71,10 +71,9 @@
             <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
         </property>  
     </parameters>
-    
+
     <start to="ResetWorkingPath"/>
-    
-    
+
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
@@ -133,6 +132,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -169,6 +169,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -205,6 +206,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -241,6 +243,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -277,6 +280,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -313,6 +317,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -349,6 +354,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -386,6 +392,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -422,6 +429,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -458,6 +466,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
@@ -494,11 +503,12 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
         <ok to="join_node"/>
         <error to="Kill"/>
     </action>
-    
+
     <join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
 
     <action name="Gen_Enriched_Orcid_Works">
@@ -509,12 +519,14 @@
             <mode>cluster</mode>
             <name>Gen_Enriched_Orcid_Works</name>
             <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
-            <jar>dhp-doiboost-1.2.2-SNAPSHOT.jar</jar>
+            <jar>dhp-doiboost-1.2.4-SNAPSHOT.jar</jar>
             <spark-opts>--num-executors 10 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
             </spark-opts>
             <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>-</arg>
             <arg>-ow</arg><arg>no_doi_works/</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/output</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>

From 5525f57ec8f9ef07d74ab30c54ab8d39e924d413 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Wed, 1 Jul 2020 18:36:14 +0200
Subject: [PATCH 05/34] converter from orcid work json to oaf

---
 .../orcidnodoi/oaf/OrcidWorkToOAF.java        | 420 +++++++++++++++++
 .../orcidnodoi/proto/ProtoWriter.java         | 427 ------------------
 .../orcidnodoi/util/DumpToActionsUtility.java | 107 +++++
 .../doiboost/orcidnodoi/util/Pair.java        |  30 ++
 .../orcidnodoi/mappings/typologies.json       |  41 ++
 5 files changed, 598 insertions(+), 427 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java
new file mode 100644
index 000000000..673abb407
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java
@@ -0,0 +1,420 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.oaf;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks;
+import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
+import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*;
+
+public class OrcidWorkToOAF {
+
+    static Logger logger = LoggerFactory.getLogger(OrcidWorkToOAF.class);
+
+    public static final String ORCID = "ORCID";
+    public final static String orcidPREFIX = "orcid_______";
+    public static final String OPENAIRE_PREFIX = "openaire____";
+    public static final String SEPARATOR = "::";
+
+    private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
+
+        {
+            put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
+
+        }
+    };
+
+    // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
+    private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
+
+        {
+            put("ark".toLowerCase(), new Pair<>("ark", "ark"));
+            put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
+            put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
+            put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
+            put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
+            put("urn".toLowerCase(), new Pair<>("urn", "urn"));
+        }
+    };
+
+    static Map<String, Map<String, String>> typologiesMapping;
+
+    static {
+        try {
+            final String tt = IOUtils.toString(OrcidWorkToOAF.class.getResourceAsStream(
+                                            "/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json"));
+            typologiesMapping = new Gson().fromJson(tt, Map.class);
+        } catch (final Exception e) {
+            logger.error("loading typologies", e);
+        }
+    }
+
+    public static final String PID_TYPES = "dnet:pid_types";
+
+    public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement, final String setName) {
+
+        if (!isValid(rootElement/*, context*/)) { return null; }
+
+        Publication publication = new Publication();
+
+        final DataInfo dataInfo = new DataInfo();
+        dataInfo.setDeletedbyinference(false);
+        dataInfo.setInferred(false);
+        dataInfo.setTrust("0.9");
+        dataInfo.setProvenanceaction(
+                mapQualifier(
+                        "sysimport:actionset:orcidworks-no-doi",
+                        "sysimport:actionset:orcidworks-no-doi",
+                        "dnet:provenanceActions",
+                    "dnet:provenanceActions"));
+        publication.setDataInfo(dataInfo);
+
+        publication.setLastupdatetimestamp(new Date().getTime());
+
+        publication.setDateofcollection("2019-10-22");
+        publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
+
+        // Adding external ids
+        externalIds.keySet().stream()
+                .forEach(jsonExtId -> {
+                    final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
+                    final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
+                    final String extId = getStringValue(rootElement, jsonExtId);
+                    if (StringUtils.isNotBlank(extId)) {
+                        publication.getExternalReference().add(
+                                convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
+                    }
+                });
+
+        // Adding source
+//        final String source = getStringValue(rootElement, "source");
+//        if (StringUtils.isNotBlank(source)) {
+//            metadata.addSource(StringField.newBuilder().setValue(source).build());
+//        }
+
+        // Adding titles
+        final List<String> titles = createRepeatedField(rootElement, "titles");
+        if (titles==null || titles.isEmpty()) {
+//            context.incrementCounter("filtered", "title_not_found", 1);
+            return null;
+        }
+        Qualifier q = mapQualifier("main title","main title","dnet:dataCite_title","dnet:dataCite_title");
+        publication.setTitle(
+                   titles
+                        .stream()
+                        .map(t -> {
+                            return mapStructuredProperty(t, q, null);
+                        })
+                        .collect(Collectors.toList()));
+        // Adding identifier
+        final String id = getStringValue(rootElement, "id");
+        String sourceId = null;
+        if (id != null) {
+            publication.setOriginalId(Arrays.asList(id));
+            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase()));
+        } else {
+            String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(","));
+            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase()));
+        }
+        publication.setId(sourceId);
+
+        // Adding relevant date
+        settingRelevantDate(rootElement, publication, "publication_date", "issued", true);
+
+        // Adding collectedfrom
+        publication.setCollectedfrom(Arrays.asList(createCollectedFrom()));
+
+        // Adding type
+        final String type = getStringValue(rootElement, "type");
+        String cobjValue = "";
+        if (StringUtils.isNotBlank(type)) {
+            publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource"));
+
+            final String typeValue = typologiesMapping.get(type).get("value");
+            cobjValue = typologiesMapping.get(type).get("cobj");
+            final Instance instance = new Instance();
+
+            // Adding hostedby
+            instance.setHostedby(createHostedBy());
+
+            // Adding url
+            final List<String> urls = createRepeatedField(rootElement, "urls");
+            if (urls!=null && !urls.isEmpty()) {
+                instance.setUrl(urls);
+            }
+
+            final String pubDate = getPublicationDate(rootElement, "publication_date");
+            if (StringUtils.isNotBlank(pubDate)) {
+                instance.setDateofacceptance(mapStringField(pubDate, null));
+            }
+
+            instance.setCollectedfrom(createCollectedFrom());
+
+            // Adding accessright
+            instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes"));
+
+            // Adding type
+            instance.setInstancetype(mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource"));
+
+            publication.setInstance(Arrays.asList(instance));
+        } else {
+//            context.incrementCounter("filtered", "type_not_found", 1);
+            return null;
+        }
+
+        // Adding authors
+        final List<Author> authors = createAuthors(rootElement);
+        if (authors != null && authors.size() > 0) {
+            publication.setAuthor(authors);
+        } else {
+//            context.incrementCounter("filtered", "author_not_found", 1);
+            return null;
+        }
+        String classValue = getDefaultResulttype(cobjValue);
+        publication.setResulttype(mapQualifier(classValue, classValue,"dnet:result_typologies", "dnet:result_typologies"));
+        return publication;
+    }
+
+    public static List<Author> createAuthors(final JsonObject root) {
+
+        final String authorsJSONFieldName = "authors";
+
+        if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
+
+            final List<Author> authors = new ArrayList<>();
+            final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
+            int firstCounter = 0;
+            int defaultCounter = 0;
+            int rank = 1;
+            int currentRank = 0;
+
+            for (final JsonElement item : jsonAuthors) {
+                final JsonObject jsonAuthor = item.getAsJsonObject();
+                final Author author = new Author();
+                if (item.isJsonObject()) {
+                    final String surname = getStringValue(jsonAuthor, "surname");
+                    final String name = getStringValue(jsonAuthor, "name");
+                    final String oid = getStringValue(jsonAuthor, "oid");
+                    final String seq = getStringValue(jsonAuthor, "seq");
+                    if (StringUtils.isNotBlank(seq)) {
+                        if (seq.equals("first")) {
+                            firstCounter += 1;
+                            rank = firstCounter;
+
+                        } else if (seq.equals("additional")) {
+                            rank = currentRank + 1;
+                        } else {
+                            defaultCounter += 1;
+                            rank = defaultCounter;
+                        }
+                    }
+
+                    if (StringUtils.isNotBlank(oid)) {
+                        author.setPid(Arrays.asList(mapAuthorId(oid)));
+                        author.setFullname(name + " " + surname);
+                        if (StringUtils.isNotBlank(name)) {
+                            author.setName(name);
+                        }
+                        if (StringUtils.isNotBlank(surname)) {
+                            author.setSurname(surname);
+                        }
+                    } else {
+                        String fullname = "";
+                        if (StringUtils.isNotBlank(name)) {
+                            fullname = name;
+                        } else {
+                            if (StringUtils.isNotBlank(surname)) {
+                                fullname = surname;
+                            }
+                        }
+                        PacePerson p = new PacePerson(fullname, false);
+                        if (p.isAccurate()) {
+                            author.setName(p.getNormalisedFirstName());
+                            author.setSurname(p.getNormalisedSurname());
+                            author.setFullname(p.getNormalisedFullname());
+                        }
+                        else {
+                            author.setFullname(fullname);
+                        }
+                    }
+                }
+                author.setRank(rank);
+                authors.add(author);
+                currentRank = rank;
+            }
+            return authors;
+
+        }
+        return null;
+    }
+
+    private static List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
+        if (!rootElement.has(fieldName)) { return null; }
+        if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
+        if (rootElement.get(fieldName).isJsonArray()) {
+            if (!isValidJsonArray(rootElement, fieldName)) { return null; }
+            return  getArrayValues(rootElement, fieldName);
+        } else {
+            String field = getStringValue(rootElement, fieldName);
+            return Arrays.asList(cleanField(field));
+        }
+    }
+
+    private static String cleanField(String value) {
+        if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
+            value = value.substring(1, value.length() - 1);
+        }
+        return value;
+    }
+
+    private static void settingRelevantDate(final JsonObject rootElement,
+                                            final Publication publication,
+                                            final String jsonKey,
+                                            final String dictionaryKey,
+                                            final boolean addToDateOfAcceptance) {
+
+        final String pubDate = getPublicationDate(rootElement, "publication_date");
+        if (StringUtils.isNotBlank(pubDate)) {
+            if (addToDateOfAcceptance) {
+                publication.setDateofacceptance(mapStringField(pubDate, null));
+            }
+            Qualifier q = mapQualifier(dictionaryKey,dictionaryKey,"dnet:dataCite_date","dnet:dataCite_date");
+            publication.setRelevantdate(
+                    Arrays.asList(pubDate)
+                            .stream()
+                            .map(r -> {
+                                return mapStructuredProperty(r, q, null);
+                            })
+                            .collect(Collectors.toList()));
+        }
+    }
+
+    private static String getPublicationDate(final JsonObject rootElement,
+                                             final String jsonKey) {
+
+        final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
+        if (pubDateJson == null) { return null; }
+        final String year = getStringValue(pubDateJson, "year");
+        final String month = getStringValue(pubDateJson, "month");
+        final String day = getStringValue(pubDateJson, "day");
+
+        if (StringUtils.isBlank(year)) { return null; }
+        String pubDate = "".concat(year);
+        if (StringUtils.isNotBlank(month)) {
+            pubDate = pubDate.concat("-" + month);
+            if (StringUtils.isNotBlank(day)) {
+                pubDate = pubDate.concat("-" + day);
+            } else {
+                pubDate += "-01";
+            }
+        } else {
+            pubDate += "-01-01";
+        }
+        if (isValidDate(pubDate)) { return pubDate; }
+        return null;
+    }
+
+    protected static boolean isValid(final JsonObject rootElement/*, final Reporter context*/) {
+
+        final String type = getStringValue(rootElement, "type");
+        if (!typologiesMapping.containsKey(type)) {
+//            context.incrementCounter("filtered", "unknowntype_" + type, 1);
+            return false;
+        }
+
+        if (!isValidJsonArray(rootElement, "titles")) {
+//            context.incrementCounter("filtered", "invalid_title", 1);
+            return false;
+        }
+        return true;
+    }
+
+    private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
+        if (!rootElement.has(fieldName)) { return false; }
+        final JsonElement jsonElement = rootElement.get(fieldName);
+        if (jsonElement.isJsonNull()) { return false; }
+        if (jsonElement.isJsonArray()) {
+            final JsonArray jsonArray = jsonElement.getAsJsonArray();
+            if (jsonArray.isJsonNull()) { return false; }
+            if (jsonArray.get(0).isJsonNull()) { return false; }
+        }
+        return true;
+    }
+
+    private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
+        final Qualifier qualifier = new Qualifier();
+        qualifier.setClassid(classId);
+        qualifier.setClassname(className);
+        qualifier.setSchemeid(schemeId);
+        qualifier.setSchemename(schemeName);
+        return qualifier;
+    }
+
+    private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, String schemeName) {
+        ExternalReference ex = new ExternalReference();
+        ex.setRefidentifier(extId);
+        ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName ));
+        return ex;
+    }
+
+    private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
+        if (value == null | StringUtils.isBlank(value)) {
+            return null;
+        }
+
+        final StructuredProperty structuredProperty = new StructuredProperty();
+        structuredProperty.setValue(value);
+        structuredProperty.setQualifier(qualifier);
+        structuredProperty.setDataInfo(dataInfo);
+        return structuredProperty;
+    }
+
+    private static Field<String> mapStringField(String value, DataInfo dataInfo) {
+        if (value == null || StringUtils.isBlank(value)) {
+            return null;
+        }
+
+        final Field<String> stringField = new Field<>();
+        stringField.setValue(value);
+        stringField.setDataInfo(dataInfo);
+        return stringField;
+    }
+
+    private static KeyValue createCollectedFrom() {
+        KeyValue cf = new KeyValue();
+        cf.setValue(ORCID);
+        cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
+        return cf;
+    }
+
+    private static KeyValue createHostedBy() {
+        KeyValue hb = new KeyValue();
+        hb.setValue("Unknown Repository");
+        hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
+        return hb;
+    }
+
+    private static StructuredProperty mapAuthorId(String orcidId) {
+        final StructuredProperty sp = new StructuredProperty();
+        sp.setValue(orcidId);
+        final Qualifier q = new Qualifier();
+        q.setClassid("ORCID");
+        q.setClassname("ORCID");
+        sp.setQualifier(q);
+        return sp;
+    }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java
deleted file mode 100644
index 01b172359..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java
+++ /dev/null
@@ -1,427 +0,0 @@
-
-package eu.dnetlib.doiboost.orcidnodoi.proto;
-
-public class ProtoWriter {
-
-}
-//
-//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
-//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
-//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
-//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
-//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
-//
-//import java.io.IOException;
-//import java.io.InputStream;
-//import java.util.ArrayList;
-//import java.util.HashMap;
-//import java.util.List;
-//import java.util.Map;
-//
-//import org.apache.commons.io.IOUtils;
-//import org.apache.commons.lang3.StringUtils;
-//
-//import com.google.gson.Gson;
-//import com.google.gson.JsonArray;
-//import com.google.gson.JsonElement;
-//import com.google.gson.JsonObject;
-//import com.googlecode.protobuf.format.JsonFormat;
-//
-//import eu.dnetlib.actionmanager.actions.ActionFactory;
-//import eu.dnetlib.actionmanager.actions.AtomicAction;
-//import eu.dnetlib.actionmanager.common.Agent;
-//import eu.dnetlib.data.mapreduce.hbase.Reporter;
-//import eu.dnetlib.data.mapreduce.util.StreamUtils;
-//import eu.dnetlib.data.proto.FieldTypeProtos;
-//import eu.dnetlib.data.proto.FieldTypeProtos.Author;
-//import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
-//import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
-//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
-//import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
-//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
-//import eu.dnetlib.data.proto.KindProtos;
-//import eu.dnetlib.data.proto.OafProtos;
-//import eu.dnetlib.data.proto.ResultProtos;
-//import eu.dnetlib.data.proto.TypeProtos;
-//import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
-//import eu.dnetlib.miscutils.collections.Pair;
-//import eu.dnetlib.miscutils.datetime.DateUtils;
-//import eu.dnetlib.pace.model.Person;
-//
-//public class ProtoWriter {
-//
-//    public static final String ORCID = "ORCID";
-//    public final static String orcidPREFIX = "orcid_______";
-//    public static final String OPENAIRE_PREFIX = "openaire____";
-//    public static final String SEPARATOR = "::";
-//
-//    private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
-//
-//        {
-//            put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
-//
-//        }
-//    };
-//
-//    // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
-//    private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
-//
-//        {
-//            put("ark".toLowerCase(), new Pair<>("ark", "ark"));
-//            put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
-//            put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
-//            put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
-//            put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
-//            put("urn".toLowerCase(), new Pair<>("urn", "urn"));
-//        }
-//    };
-//
-//    static Map<String, Map<String, String>> typologiesMapping;
-//
-//    static {
-//        try {
-//            final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
-//            final String tt = IOUtils.toString(is);
-//            typologiesMapping = new Gson().fromJson(tt, Map.class);
-//        } catch (final IOException e) {
-//            e.printStackTrace();
-//        }
-//    }
-//
-//    public static final String PID_TYPES = "dnet:pid_types";
-//
-//    public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
-//                                                                        final ActionFactory factory,
-//                                                                        final String setName,
-//                                                                        final Agent agent,
-//                                                                        final Reporter context) {
-//
-//        if (!isValid(rootElement, context)) { return null; }
-//
-//        // Create OAF proto
-//
-//        final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
-//
-//        oaf.setDataInfo(
-//                DataInfo.newBuilder()
-//                        .setDeletedbyinference(false)
-//                        .setInferred(false)
-//                        .setTrust("0.9")
-//                        .setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
-//                        .build());
-//
-//        // Adding kind
-//        oaf.setKind(KindProtos.Kind.entity);
-//
-//        oaf.setLastupdatetimestamp(DateUtils.now());
-//
-//        // creating result proto
-//        final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
-//
-//        entity.setDateofcollection("2018-10-22");
-//        entity.setDateoftransformation(DateUtils.now_ISO8601());
-//
-//        // Adding external ids
-//        StreamUtils.toStream(externalIds.keySet().iterator())
-//                .forEach(jsonExtId -> {
-//                    final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
-//                    final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
-//                    final String extId = getStringValue(rootElement, jsonExtId);
-//                    if (StringUtils.isNotBlank(extId)) {
-//                        entity.addPid(StructuredProperty.newBuilder()
-//                                .setValue(extId)
-//                                .setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
-//                                        .setSchemename("dnet:pid_types").build())
-//                                .build());
-//                    }
-//                });
-//
-//        // Create result field
-//        final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
-//
-//        // Create metadata proto
-//        final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
-//
-//        // Adding source
-//        final String source = getStringValue(rootElement, "source");
-//        if (StringUtils.isNotBlank(source)) {
-//            metadata.addSource(StringField.newBuilder().setValue(source).build());
-//        }
-//
-//        // Adding title
-//        final String title = createRepeatedField(rootElement, "titles");
-//        if (StringUtils.isBlank(title)) {
-//            context.incrementCounter("filtered", "title_not_found", 1);
-//            return null;
-//        }
-//        metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
-//                .setValue(title)
-//                .setQualifier(getQualifier("main title", "dnet:dataCite_title"))
-//                .build());
-//
-//        // Adding identifier
-//        final String id = getStringValue(rootElement, "id");
-//        String sourceId = null;
-//        if (id != null) {
-//            entity.addOriginalId(id);
-//            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
-//        } else {
-//            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
-//        }
-//        entity.setId(sourceId);
-//
-//        // Adding relevant date
-//        settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
-//
-//        // Adding collectedfrom
-//        final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
-//                .setValue(ORCID)
-//                .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
-//                .build();
-//        entity.addCollectedfrom(collectedFrom);
-//
-//        // Adding type
-//        final String type = getStringValue(rootElement, "type");
-//        String cobjValue = "";
-//        if (StringUtils.isNotBlank(type)) {
-//
-//            metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
-//                    .setClassid(type)
-//                    .setClassname(type)
-//                    .setSchemeid("dnet:dataCite_resource")
-//                    .setSchemename("dnet:dataCite_resource")
-//                    .build());
-//
-//            final String typeValue = typologiesMapping.get(type).get("value");
-//            cobjValue = typologiesMapping.get(type).get("cobj");
-//            final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
-//
-//            // Adding hostedby
-//            instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
-//                    .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
-//                    .setValue("Unknown Repository")
-//                    .build());
-//
-//            // Adding url
-//            final String url = createRepeatedField(rootElement, "urls");
-//            if (StringUtils.isNotBlank(url)) {
-//                instance.addUrl(url);
-//            }
-//
-//            final String pubDate = getPublicationDate(rootElement, "publication_date");
-//            if (StringUtils.isNotBlank(pubDate)) {
-//                instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
-//            }
-//
-//            instance.setCollectedfrom(collectedFrom);
-//
-//            // Adding accessright
-//            instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
-//                    .setClassid("UNKNOWN")
-//                    .setClassname("UNKNOWN")
-//                    .setSchemeid("dnet:access_modes")
-//                    .setSchemename("dnet:access_modes")
-//                    .build());
-//
-//            // Adding type
-//            instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
-//                    .setClassid(cobjValue)
-//                    .setClassname(typeValue)
-//                    .setSchemeid("dnet:publication_resource")
-//                    .setSchemename("dnet:publication_resource")
-//                    .build());
-//
-//            result.addInstance(instance);
-//        } else {
-//            context.incrementCounter("filtered", "type_not_found", 1);
-//            return null;
-//        }
-//
-//        // Adding authors
-//        final List<Author> authors = createAuthors(rootElement);
-//        if (authors != null && authors.size() > 0) {
-//            metadata.addAllAuthor(authors);
-//        } else {
-//            context.incrementCounter("filtered", "author_not_found", 1);
-//            return null;
-//        }
-//
-//        metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
-//        result.setMetadata(metadata.build());
-//        entity.setResult(result.build());
-//        oaf.setEntity(entity.build());
-//
-//        final List<AtomicAction> actionList = new ArrayList<>();
-//
-//        actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
-//
-////		 System.out.println(JsonFormat.printToString(oaf.build()));
-//        return actionList;
-//
-//    }
-//
-//    public static List<Author> createAuthors(final JsonObject root) {
-//
-//        final String authorsJSONFieldName = "authors";
-//
-//        if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
-//
-//            final List<Author> authors = new ArrayList<>();
-//            final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
-//            int firstCounter = 0;
-//            int defaultCounter = 0;
-//            int rank = 1;
-//            int currentRank = 0;
-//
-//            for (final JsonElement item : jsonAuthors) {
-//                final JsonObject author = item.getAsJsonObject();
-//                final Author.Builder result = Author.newBuilder();
-//                if (item.isJsonObject()) {
-//                    final String surname = getStringValue(author, "surname");
-//                    final String name = getStringValue(author, "name");
-//                    final String oid = getStringValue(author, "oid");
-//                    final String seq = getStringValue(author, "seq");
-//                    if (StringUtils.isNotBlank(seq)) {
-//                        if (seq.equals("first")) {
-//                            firstCounter += 1;
-//                            rank = firstCounter;
-//
-//                        } else if (seq.equals("additional")) {
-//                            rank = currentRank + 1;
-//                        } else {
-//                            defaultCounter += 1;
-//                            rank = defaultCounter;
-//                        }
-//                    }
-//
-//                    if (StringUtils.isNotBlank(oid)) {
-//                        result.addPid(KeyValue.newBuilder()
-//                                .setValue(oid)
-//                                .setKey("ORCID")
-//                                .build());
-//                        result.setFullname(name + " " + surname);
-//                        if (StringUtils.isNotBlank(name)) {
-//                            result.setName(name);
-//                        }
-//                        if (StringUtils.isNotBlank(surname)) {
-//                            result.setSurname(surname);
-//                        }
-//                    } else {
-//                        String fullname = "";
-//                        if (StringUtils.isNotBlank(name)) {
-//                            fullname = name;
-//                        } else {
-//                            if (StringUtils.isNotBlank(surname)) {
-//                                fullname = surname;
-//                            }
-//                        }
-//                        Person p = new Person(fullname, false);
-//                        if (p.isAccurate()) {
-//                            result.setName(p.getNormalisedFirstName());
-//                            result.setSurname(p.getNormalisedSurname());
-//                            result.setFullname(p.getNormalisedFullname());
-//                        }
-//                        else {
-//                            result.setFullname(fullname);
-//                        }
-//                    }
-//                }
-//                result.setRank(rank);
-//                authors.add(result.build());
-//                currentRank = rank;
-//            }
-//            return authors;
-//
-//        }
-//        return null;
-//    }
-//
-//    private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
-//        String field = "";
-//        if (!rootElement.has(fieldName)) { return null; }
-//        if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
-//        if (rootElement.get(fieldName).isJsonArray()) {
-//            if (!isValidJsonArray(rootElement, fieldName)) { return null; }
-//            final StringBuilder ttl = new StringBuilder();
-//            getArrayValues(rootElement, fieldName).forEach(ttl::append);
-//            field = ttl.toString();
-//        } else {
-//            field = getStringValue(rootElement, fieldName);
-//        }
-//
-//        if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
-//            field = field.substring(1, field.length() - 1);
-//        }
-//        return field;
-//    }
-//
-//    private static void settingRelevantDate(final JsonObject rootElement,
-//                                            final ResultProtos.Result.Metadata.Builder metadata,
-//                                            final String jsonKey,
-//                                            final String dictionaryKey,
-//                                            final boolean addToDateOfAcceptance) {
-//
-//        final String pubDate = getPublicationDate(rootElement, "publication_date");
-//        if (StringUtils.isNotBlank(pubDate)) {
-//            if (addToDateOfAcceptance) {
-//                metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
-//            }
-//            metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
-//                    .setValue(pubDate)
-//                    .setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
-//                    .build());
-//        }
-//    }
-//
-//    private static String getPublicationDate(final JsonObject rootElement,
-//                                             final String jsonKey) {
-//
-//        final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
-//        if (pubDateJson == null) { return null; }
-//        final String year = getStringValue(pubDateJson, "year");
-//        final String month = getStringValue(pubDateJson, "month");
-//        final String day = getStringValue(pubDateJson, "day");
-//
-//        if (StringUtils.isBlank(year)) { return null; }
-//        String pubDate = "".concat(year);
-//        if (StringUtils.isNotBlank(month)) {
-//            pubDate = pubDate.concat("-" + month);
-//            if (StringUtils.isNotBlank(day)) {
-//                pubDate = pubDate.concat("-" + day);
-//            } else {
-//                pubDate += "-01";
-//            }
-//        } else {
-//            pubDate += "-01-01";
-//        }
-//        if (isValidDate(pubDate)) { return pubDate; }
-//        return null;
-//    }
-//
-//    protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
-//
-//        final String type = getStringValue(rootElement, "type");
-//        if (!typologiesMapping.containsKey(type)) {
-//            context.incrementCounter("filtered", "unknowntype_" + type, 1);
-//            return false;
-//        }
-//
-//        if (!isValidJsonArray(rootElement, "titles")) {
-//            context.incrementCounter("filtered", "invalid_title", 1);
-//            return false;
-//        }
-//        return true;
-//    }
-//
-//    private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
-//        if (!rootElement.has(fieldName)) { return false; }
-//        final JsonElement jsonElement = rootElement.get(fieldName);
-//        if (jsonElement.isJsonNull()) { return false; }
-//        if (jsonElement.isJsonArray()) {
-//            final JsonArray jsonArray = jsonElement.getAsJsonArray();
-//            if (jsonArray.isJsonNull()) { return false; }
-//            if (jsonArray.get(0).isJsonNull()) { return false; }
-//        }
-//        return true;
-//    }
-//}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
new file mode 100644
index 000000000..c460f6299
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
@@ -0,0 +1,107 @@
+package eu.dnetlib.doiboost.orcidnodoi.util;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+import org.apache.commons.lang3.StringUtils;
+
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+public class DumpToActionsUtility {
+
+    private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
+
+    public static String getStringValue(final JsonObject root, final String key) {
+        if (root.has(key) && !root.get(key).isJsonNull())
+            return root.get(key).getAsString();
+        return null;
+    }
+
+    public static List<String> getArrayValues(final JsonObject root, final String key) {
+        if (root.has(key) && root.get(key).isJsonArray()) {
+            final JsonArray asJsonArray = root.get(key).getAsJsonArray();
+            final List<String> result = new ArrayList<>();
+
+
+            asJsonArray.forEach(it -> {
+                if (StringUtils.isNotBlank(it.getAsString())) {
+                    result.add(it.getAsString());
+                }
+            });
+            return result;
+        }
+        return new ArrayList<>();
+    }
+    public static List<JsonObject> getArrayObjects(final JsonObject root, final String key) {
+        if (root.has(key) && root.get(key).isJsonArray()) {
+            final JsonArray asJsonArray = root.get(key).getAsJsonArray();
+            final List<JsonObject> result = new ArrayList<>();
+            asJsonArray.forEach(it -> {
+                if (it.getAsJsonObject() != null)  {
+                    result.add(it.getAsJsonObject());
+                }
+            });
+            return result;
+        }
+        return new ArrayList<>();
+    }
+
+    public static boolean isValidDate(final String date) {
+        return date.matches("\\d{4}-\\d{2}-\\d{2}");
+    }
+
+    public static String now_ISO8601() { // NOPMD
+        String result;
+        synchronized (ISO8601FORMAT) {
+            result = ISO8601FORMAT.format(new Date());
+        }
+        //convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00
+        //- note the added colon for the Timezone
+        return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2);
+    }
+
+    public static String getDefaultResulttype(final String cobjcategory) {
+        switch (cobjcategory) {
+            case "0029":
+                return "software";
+            case "0021":
+            case "0024":
+            case "0025":
+            case "0030":
+                return "dataset";
+            case "0000":
+            case "0010":
+            case "0018":
+            case "0020":
+            case "0022":
+            case "0023":
+            case "0026":
+            case "0027":
+            case "0028":
+            case "0037":
+                return "other";
+            case "0001":
+            case "0002":
+            case "0004":
+            case "0005":
+            case "0006":
+            case "0007":
+            case "0008":
+            case "0009":
+            case "0011":
+            case "0012":
+            case "0013":
+            case "0014":
+            case "0015":
+            case "0016":
+            case "0017":
+            case "0019":
+            case "0031":
+            case "0032":
+                return "publication";
+            default:
+                return "publication";
+        }
+    }
+
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java
new file mode 100644
index 000000000..58c09af60
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java
@@ -0,0 +1,30 @@
+package eu.dnetlib.doiboost.orcidnodoi.util;
+
+public class Pair<K, V> {
+
+    private K k;
+
+    private V v;
+
+    public Pair(K k, V v) {
+        this.k = k;
+        this.v = v;
+    }
+
+    public K getKey() {
+        return k;
+    }
+
+    public V getValue() {
+        return v;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (obj instanceof Pair<?, ?>) {
+            Pair<?, ?> tmp = (Pair<?, ?>) obj;
+            return k.equals(tmp.getKey()) && v.equals(tmp.getValue());
+        } else return false;
+    }
+
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json
new file mode 100644
index 000000000..cb696f279
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json
@@ -0,0 +1,41 @@
+{
+  "reference-entry":      {"cobj":"0013", "value":	"Part of book or chapter of book"},
+  "report":               {"cobj":"0017", "value":	"Report"},
+  "dataset":              {"cobj":"0021", "value":	"Dataset"},
+  "journal-article":      {"cobj":"0001", "value":	"Article"},
+  "reference-book":       {"cobj":"0002", "value":	"Book"},
+  "other":                {"cobj":"0020", "value":	"Other ORP type"},
+  "proceedings-article":  {"cobj":"0004", "value":	"Conference object"},
+  "standard":             {"cobj":"0038", "value":	"Other literature type"},
+  "book-part":            {"cobj":"0002", "value":	"Book"},
+  "monograph":            {"cobj":"0002", "value":	"Book"},
+  "report-series":        {"cobj":"0017", "value":	"Report"},
+  "book":                 {"cobj":"0002", "value":	"Book"},
+  "book-chapter":         {"cobj":"0013", "value":	"Part of book or chapter of book"},
+  "peer-review":          {"cobj":"0015", "value":	"Review"},
+  "book-section":         {"cobj":"0013", "value":	"Part of book or chapter of book"},
+  "book-review":          {"cobj":"0015", "value":	"Review"},
+  "conference-abstract":  {"cobj":"0004", "value":	"Conference object"},
+  "conference-paper":     {"cobj":"0004", "value":	"Conference object"},
+  "conference-poster":    {"cobj":"0004", "value":	"Conference object"},
+  "data-set":             {"cobj":"0021", "value":	"Dataset"},
+  "dictionary-entry":     {"cobj":"0038", "value":	"Other literature type"},
+  "disclosure":           {"cobj":"0038", "value":	"Other literature type"},
+  "dissertation":         {"cobj":"0006", "value":	"Doctoral thesis"},
+  "edited-book":          {"cobj":"0002", "value":	"Book"},
+  "encyclopedia-entry":   {"cobj":"0038", "value":	"Other literature type"},
+  "lecture-speech":       {"cobj":"0010", "value":	"Lecture"},
+  "license":              {"cobj":"0038", "value":	"Other literature type"},
+  "magazine-article":     {"cobj":"0005", "value":	"Contribution for newspaper or weekly magazine"},
+  "manual":               {"cobj":"0038", "value":	"Other literature type"},
+  "newsletter-article":   {"cobj":"0012", "value":	"Newsletter"},
+  "newspaper-article":    {"cobj":"0005", "value":	"Contribution for newspaper or weekly magazine"},
+  "patent":               {"cobj":"0019", "value":	"Patent"},
+  "research-technique":   {"cobj":"0020", "value":	"Other ORP type"},
+  "research-tool":        {"cobj":"0020", "value":	"Other ORP type"},
+  "standards-and-policy": {"cobj":"0038", "value":	"Other literature type"},
+  "supervised-student-publication": {"cobj":"0001", "value":	"Article"},
+  "technical-standard":   {"cobj":"0038", "value":	"Other literature type"},
+  "website":              {"cobj":"0020", "value":	"Other ORP type"},
+  "working-paper":        {"cobj":"0014", "value":	"Research"}
+}
\ No newline at end of file

From 1729cc5cf320c32cdafa2523d884d965ccefdc98 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 2 Jul 2020 18:46:20 +0200
Subject: [PATCH 06/34] publication conversion from json to oaf test

---
 .../orcidnodoi/oaf/OrcidWorkToOAF.java        | 420 ----------------
 .../orcidnodoi/oaf/PublicationToOaf.java      | 456 ++++++++++++++++++
 .../orcidnodoi/util/DumpToActionsUtility.java | 184 +++----
 .../doiboost/orcidnodoi/util/Pair.java        |  40 +-
 .../doiboost/orcid/OrcidClientTest.java       |   2 +-
 .../orcidnodoi/PublicationToOafTest.java      |  76 +++
 .../orcidnodoi/xml/OrcidNoDoiTest.java        |   3 +-
 .../doiboost/orcidnodoi/publication.json      |   1 +
 8 files changed, 650 insertions(+), 532 deletions(-)
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java
deleted file mode 100644
index 673abb407..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java
+++ /dev/null
@@ -1,420 +0,0 @@
-
-package eu.dnetlib.doiboost.orcidnodoi.oaf;
-
-import com.google.gson.Gson;
-import com.google.gson.JsonArray;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonObject;
-import eu.dnetlib.dhp.common.PacePerson;
-import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.utils.DHPUtils;
-import eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks;
-import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
-import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.*;
-import java.util.stream.Collectors;
-
-import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*;
-
-public class OrcidWorkToOAF {
-
-    static Logger logger = LoggerFactory.getLogger(OrcidWorkToOAF.class);
-
-    public static final String ORCID = "ORCID";
-    public final static String orcidPREFIX = "orcid_______";
-    public static final String OPENAIRE_PREFIX = "openaire____";
-    public static final String SEPARATOR = "::";
-
-    private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
-
-        {
-            put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
-
-        }
-    };
-
-    // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
-    private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
-
-        {
-            put("ark".toLowerCase(), new Pair<>("ark", "ark"));
-            put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
-            put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
-            put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
-            put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
-            put("urn".toLowerCase(), new Pair<>("urn", "urn"));
-        }
-    };
-
-    static Map<String, Map<String, String>> typologiesMapping;
-
-    static {
-        try {
-            final String tt = IOUtils.toString(OrcidWorkToOAF.class.getResourceAsStream(
-                                            "/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json"));
-            typologiesMapping = new Gson().fromJson(tt, Map.class);
-        } catch (final Exception e) {
-            logger.error("loading typologies", e);
-        }
-    }
-
-    public static final String PID_TYPES = "dnet:pid_types";
-
-    public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement, final String setName) {
-
-        if (!isValid(rootElement/*, context*/)) { return null; }
-
-        Publication publication = new Publication();
-
-        final DataInfo dataInfo = new DataInfo();
-        dataInfo.setDeletedbyinference(false);
-        dataInfo.setInferred(false);
-        dataInfo.setTrust("0.9");
-        dataInfo.setProvenanceaction(
-                mapQualifier(
-                        "sysimport:actionset:orcidworks-no-doi",
-                        "sysimport:actionset:orcidworks-no-doi",
-                        "dnet:provenanceActions",
-                    "dnet:provenanceActions"));
-        publication.setDataInfo(dataInfo);
-
-        publication.setLastupdatetimestamp(new Date().getTime());
-
-        publication.setDateofcollection("2019-10-22");
-        publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
-
-        // Adding external ids
-        externalIds.keySet().stream()
-                .forEach(jsonExtId -> {
-                    final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
-                    final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
-                    final String extId = getStringValue(rootElement, jsonExtId);
-                    if (StringUtils.isNotBlank(extId)) {
-                        publication.getExternalReference().add(
-                                convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
-                    }
-                });
-
-        // Adding source
-//        final String source = getStringValue(rootElement, "source");
-//        if (StringUtils.isNotBlank(source)) {
-//            metadata.addSource(StringField.newBuilder().setValue(source).build());
-//        }
-
-        // Adding titles
-        final List<String> titles = createRepeatedField(rootElement, "titles");
-        if (titles==null || titles.isEmpty()) {
-//            context.incrementCounter("filtered", "title_not_found", 1);
-            return null;
-        }
-        Qualifier q = mapQualifier("main title","main title","dnet:dataCite_title","dnet:dataCite_title");
-        publication.setTitle(
-                   titles
-                        .stream()
-                        .map(t -> {
-                            return mapStructuredProperty(t, q, null);
-                        })
-                        .collect(Collectors.toList()));
-        // Adding identifier
-        final String id = getStringValue(rootElement, "id");
-        String sourceId = null;
-        if (id != null) {
-            publication.setOriginalId(Arrays.asList(id));
-            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase()));
-        } else {
-            String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(","));
-            sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase()));
-        }
-        publication.setId(sourceId);
-
-        // Adding relevant date
-        settingRelevantDate(rootElement, publication, "publication_date", "issued", true);
-
-        // Adding collectedfrom
-        publication.setCollectedfrom(Arrays.asList(createCollectedFrom()));
-
-        // Adding type
-        final String type = getStringValue(rootElement, "type");
-        String cobjValue = "";
-        if (StringUtils.isNotBlank(type)) {
-            publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource"));
-
-            final String typeValue = typologiesMapping.get(type).get("value");
-            cobjValue = typologiesMapping.get(type).get("cobj");
-            final Instance instance = new Instance();
-
-            // Adding hostedby
-            instance.setHostedby(createHostedBy());
-
-            // Adding url
-            final List<String> urls = createRepeatedField(rootElement, "urls");
-            if (urls!=null && !urls.isEmpty()) {
-                instance.setUrl(urls);
-            }
-
-            final String pubDate = getPublicationDate(rootElement, "publication_date");
-            if (StringUtils.isNotBlank(pubDate)) {
-                instance.setDateofacceptance(mapStringField(pubDate, null));
-            }
-
-            instance.setCollectedfrom(createCollectedFrom());
-
-            // Adding accessright
-            instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes"));
-
-            // Adding type
-            instance.setInstancetype(mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource"));
-
-            publication.setInstance(Arrays.asList(instance));
-        } else {
-//            context.incrementCounter("filtered", "type_not_found", 1);
-            return null;
-        }
-
-        // Adding authors
-        final List<Author> authors = createAuthors(rootElement);
-        if (authors != null && authors.size() > 0) {
-            publication.setAuthor(authors);
-        } else {
-//            context.incrementCounter("filtered", "author_not_found", 1);
-            return null;
-        }
-        String classValue = getDefaultResulttype(cobjValue);
-        publication.setResulttype(mapQualifier(classValue, classValue,"dnet:result_typologies", "dnet:result_typologies"));
-        return publication;
-    }
-
-    public static List<Author> createAuthors(final JsonObject root) {
-
-        final String authorsJSONFieldName = "authors";
-
-        if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
-
-            final List<Author> authors = new ArrayList<>();
-            final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
-            int firstCounter = 0;
-            int defaultCounter = 0;
-            int rank = 1;
-            int currentRank = 0;
-
-            for (final JsonElement item : jsonAuthors) {
-                final JsonObject jsonAuthor = item.getAsJsonObject();
-                final Author author = new Author();
-                if (item.isJsonObject()) {
-                    final String surname = getStringValue(jsonAuthor, "surname");
-                    final String name = getStringValue(jsonAuthor, "name");
-                    final String oid = getStringValue(jsonAuthor, "oid");
-                    final String seq = getStringValue(jsonAuthor, "seq");
-                    if (StringUtils.isNotBlank(seq)) {
-                        if (seq.equals("first")) {
-                            firstCounter += 1;
-                            rank = firstCounter;
-
-                        } else if (seq.equals("additional")) {
-                            rank = currentRank + 1;
-                        } else {
-                            defaultCounter += 1;
-                            rank = defaultCounter;
-                        }
-                    }
-
-                    if (StringUtils.isNotBlank(oid)) {
-                        author.setPid(Arrays.asList(mapAuthorId(oid)));
-                        author.setFullname(name + " " + surname);
-                        if (StringUtils.isNotBlank(name)) {
-                            author.setName(name);
-                        }
-                        if (StringUtils.isNotBlank(surname)) {
-                            author.setSurname(surname);
-                        }
-                    } else {
-                        String fullname = "";
-                        if (StringUtils.isNotBlank(name)) {
-                            fullname = name;
-                        } else {
-                            if (StringUtils.isNotBlank(surname)) {
-                                fullname = surname;
-                            }
-                        }
-                        PacePerson p = new PacePerson(fullname, false);
-                        if (p.isAccurate()) {
-                            author.setName(p.getNormalisedFirstName());
-                            author.setSurname(p.getNormalisedSurname());
-                            author.setFullname(p.getNormalisedFullname());
-                        }
-                        else {
-                            author.setFullname(fullname);
-                        }
-                    }
-                }
-                author.setRank(rank);
-                authors.add(author);
-                currentRank = rank;
-            }
-            return authors;
-
-        }
-        return null;
-    }
-
-    private static List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
-        if (!rootElement.has(fieldName)) { return null; }
-        if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
-        if (rootElement.get(fieldName).isJsonArray()) {
-            if (!isValidJsonArray(rootElement, fieldName)) { return null; }
-            return  getArrayValues(rootElement, fieldName);
-        } else {
-            String field = getStringValue(rootElement, fieldName);
-            return Arrays.asList(cleanField(field));
-        }
-    }
-
-    private static String cleanField(String value) {
-        if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
-            value = value.substring(1, value.length() - 1);
-        }
-        return value;
-    }
-
-    private static void settingRelevantDate(final JsonObject rootElement,
-                                            final Publication publication,
-                                            final String jsonKey,
-                                            final String dictionaryKey,
-                                            final boolean addToDateOfAcceptance) {
-
-        final String pubDate = getPublicationDate(rootElement, "publication_date");
-        if (StringUtils.isNotBlank(pubDate)) {
-            if (addToDateOfAcceptance) {
-                publication.setDateofacceptance(mapStringField(pubDate, null));
-            }
-            Qualifier q = mapQualifier(dictionaryKey,dictionaryKey,"dnet:dataCite_date","dnet:dataCite_date");
-            publication.setRelevantdate(
-                    Arrays.asList(pubDate)
-                            .stream()
-                            .map(r -> {
-                                return mapStructuredProperty(r, q, null);
-                            })
-                            .collect(Collectors.toList()));
-        }
-    }
-
-    private static String getPublicationDate(final JsonObject rootElement,
-                                             final String jsonKey) {
-
-        final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
-        if (pubDateJson == null) { return null; }
-        final String year = getStringValue(pubDateJson, "year");
-        final String month = getStringValue(pubDateJson, "month");
-        final String day = getStringValue(pubDateJson, "day");
-
-        if (StringUtils.isBlank(year)) { return null; }
-        String pubDate = "".concat(year);
-        if (StringUtils.isNotBlank(month)) {
-            pubDate = pubDate.concat("-" + month);
-            if (StringUtils.isNotBlank(day)) {
-                pubDate = pubDate.concat("-" + day);
-            } else {
-                pubDate += "-01";
-            }
-        } else {
-            pubDate += "-01-01";
-        }
-        if (isValidDate(pubDate)) { return pubDate; }
-        return null;
-    }
-
-    protected static boolean isValid(final JsonObject rootElement/*, final Reporter context*/) {
-
-        final String type = getStringValue(rootElement, "type");
-        if (!typologiesMapping.containsKey(type)) {
-//            context.incrementCounter("filtered", "unknowntype_" + type, 1);
-            return false;
-        }
-
-        if (!isValidJsonArray(rootElement, "titles")) {
-//            context.incrementCounter("filtered", "invalid_title", 1);
-            return false;
-        }
-        return true;
-    }
-
-    private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
-        if (!rootElement.has(fieldName)) { return false; }
-        final JsonElement jsonElement = rootElement.get(fieldName);
-        if (jsonElement.isJsonNull()) { return false; }
-        if (jsonElement.isJsonArray()) {
-            final JsonArray jsonArray = jsonElement.getAsJsonArray();
-            if (jsonArray.isJsonNull()) { return false; }
-            if (jsonArray.get(0).isJsonNull()) { return false; }
-        }
-        return true;
-    }
-
-    private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
-        final Qualifier qualifier = new Qualifier();
-        qualifier.setClassid(classId);
-        qualifier.setClassname(className);
-        qualifier.setSchemeid(schemeId);
-        qualifier.setSchemename(schemeName);
-        return qualifier;
-    }
-
-    private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, String schemeName) {
-        ExternalReference ex = new ExternalReference();
-        ex.setRefidentifier(extId);
-        ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName ));
-        return ex;
-    }
-
-    private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
-        if (value == null | StringUtils.isBlank(value)) {
-            return null;
-        }
-
-        final StructuredProperty structuredProperty = new StructuredProperty();
-        structuredProperty.setValue(value);
-        structuredProperty.setQualifier(qualifier);
-        structuredProperty.setDataInfo(dataInfo);
-        return structuredProperty;
-    }
-
-    private static Field<String> mapStringField(String value, DataInfo dataInfo) {
-        if (value == null || StringUtils.isBlank(value)) {
-            return null;
-        }
-
-        final Field<String> stringField = new Field<>();
-        stringField.setValue(value);
-        stringField.setDataInfo(dataInfo);
-        return stringField;
-    }
-
-    private static KeyValue createCollectedFrom() {
-        KeyValue cf = new KeyValue();
-        cf.setValue(ORCID);
-        cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
-        return cf;
-    }
-
-    private static KeyValue createHostedBy() {
-        KeyValue hb = new KeyValue();
-        hb.setValue("Unknown Repository");
-        hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
-        return hb;
-    }
-
-    private static StructuredProperty mapAuthorId(String orcidId) {
-        final StructuredProperty sp = new StructuredProperty();
-        sp.setValue(orcidId);
-        final Qualifier q = new Qualifier();
-        q.setClassid("ORCID");
-        q.setClassname("ORCID");
-        sp.setQualifier(q);
-        return sp;
-    }
-}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
new file mode 100644
index 000000000..dc03767ec
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -0,0 +1,456 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.oaf;
+
+import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+
+import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
+import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
+
+public class PublicationToOaf {
+
+	static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
+
+	public static final String ORCID = "ORCID";
+	public final static String orcidPREFIX = "orcid_______";
+	public static final String OPENAIRE_PREFIX = "openaire____";
+	public static final String SEPARATOR = "::";
+
+	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
+
+		{
+			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
+
+		}
+	};
+
+	// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
+	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
+
+		{
+			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
+			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
+			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
+			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
+			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
+			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
+		}
+	};
+
+	static Map<String, Map<String, String>> typologiesMapping;
+
+	static {
+		try {
+			final String tt = IOUtils
+				.toString(
+					PublicationToOaf.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json"));
+			typologiesMapping = new Gson().fromJson(tt, Map.class);
+		} catch (final Exception e) {
+			logger.error("loading typologies", e);
+		}
+	}
+
+	public static final String PID_TYPES = "dnet:pid_types";
+
+	public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
+
+		logger.debug("generatePublicationActionsFromDump ...");
+		if (!isValid(rootElement/* , context */)) {
+			logger.error("publication not valid");
+			return null;
+		}
+
+		Publication publication = new Publication();
+
+		final DataInfo dataInfo = new DataInfo();
+		dataInfo.setDeletedbyinference(false);
+		dataInfo.setInferred(false);
+		dataInfo.setTrust("0.9");
+		dataInfo
+			.setProvenanceaction(
+				mapQualifier(
+					"sysimport:actionset:orcidworks-no-doi",
+					"sysimport:actionset:orcidworks-no-doi",
+					"dnet:provenanceActions",
+					"dnet:provenanceActions"));
+		publication.setDataInfo(dataInfo);
+
+		publication.setLastupdatetimestamp(new Date().getTime());
+
+		publication.setDateofcollection("2019-10-22");
+		publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
+
+		// Adding external ids
+		externalIds
+			.keySet()
+			.stream()
+			.forEach(jsonExtId -> {
+				final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
+				final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
+				final String extId = getStringValue(rootElement, jsonExtId);
+				if (StringUtils.isNotBlank(extId)) {
+					publication
+						.getExternalReference()
+						.add(
+							convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
+				}
+			});
+
+		// Adding source
+		final String source = getStringValue(rootElement, "sourceName");
+		if (StringUtils.isNotBlank(source)) {
+			publication.setSource(Arrays.asList(mapStringField(source, null)));
+		}
+
+		// Adding titles
+		final List<String> titles = createRepeatedField(rootElement, "titles");
+		if (titles == null || titles.isEmpty()) {
+			logger.error("titles not found");
+//            context.incrementCounter("filtered", "title_not_found", 1);
+			return null;
+		}
+		Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
+		publication
+			.setTitle(
+				titles
+					.stream()
+					.map(t -> {
+						return mapStructuredProperty(t, q, null);
+					})
+					.collect(Collectors.toList()));
+		// Adding identifier
+		final String id = getStringValue(rootElement, "id");
+		String sourceId = null;
+		if (id != null) {
+			publication.setOriginalId(Arrays.asList(id));
+			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase()));
+		} else {
+			String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(","));
+			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase()));
+		}
+		publication.setId(sourceId);
+
+		// Adding relevant date
+		settingRelevantDate(rootElement, publication, "publication_date", "issued", true);
+
+		// Adding collectedfrom
+		publication.setCollectedfrom(Arrays.asList(createCollectedFrom()));
+
+		// Adding type
+		final String type = getStringValue(rootElement, "type");
+		String cobjValue = "";
+		if (StringUtils.isNotBlank(type)) {
+			publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource"));
+
+			final String typeValue = typologiesMapping.get(type).get("value");
+			cobjValue = typologiesMapping.get(type).get("cobj");
+			final Instance instance = new Instance();
+
+			// Adding hostedby
+			instance.setHostedby(createHostedBy());
+
+			// Adding url
+			final List<String> urls = createRepeatedField(rootElement, "urls");
+			if (urls != null && !urls.isEmpty()) {
+				instance.setUrl(urls);
+			}
+
+			final String pubDate = getPublicationDate(rootElement, "publication_date");
+			if (StringUtils.isNotBlank(pubDate)) {
+				instance.setDateofacceptance(mapStringField(pubDate, null));
+			}
+
+			instance.setCollectedfrom(createCollectedFrom());
+
+			// Adding accessright
+			instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes"));
+
+			// Adding type
+			instance
+				.setInstancetype(
+					mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource"));
+
+			publication.setInstance(Arrays.asList(instance));
+		} else {
+			logger.error("type not found");
+//            context.incrementCounter("filtered", "type_not_found", 1);
+			return null;
+		}
+
+		// Adding authors
+		final List<Author> authors = createAuthors(rootElement);
+		if (authors != null && authors.size() > 0) {
+			publication.setAuthor(authors);
+		} else {
+			logger.error("authors not found");
+//            context.incrementCounter("filtered", "author_not_found", 1);
+			return null;
+		}
+		String classValue = getDefaultResulttype(cobjValue);
+		publication
+			.setResulttype(mapQualifier(classValue, classValue, "dnet:result_typologies", "dnet:result_typologies"));
+		return publication;
+	}
+
+	public static List<Author> createAuthors(final JsonObject root) {
+
+		final String authorsJSONFieldName = "contributors";
+
+		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
+
+			final List<Author> authors = new ArrayList<>();
+			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
+			int firstCounter = 0;
+			int defaultCounter = 0;
+			int rank = 1;
+			int currentRank = 0;
+
+			for (final JsonElement item : jsonAuthors) {
+				final JsonObject jsonAuthor = item.getAsJsonObject();
+				final Author author = new Author();
+				if (item.isJsonObject()) {
+					final String creditname = getStringValue(jsonAuthor, "creditName");
+					final String surname = getStringValue(jsonAuthor, "surname");
+					final String name = getStringValue(jsonAuthor, "name");
+					final String oid = getStringValue(jsonAuthor, "oid");
+					final String seq = getStringValue(jsonAuthor, "sequence");
+					if (StringUtils.isNotBlank(seq)) {
+						if (seq.equals("first")) {
+							firstCounter += 1;
+							rank = firstCounter;
+
+						} else if (seq.equals("additional")) {
+							rank = currentRank + 1;
+						} else {
+							defaultCounter += 1;
+							rank = defaultCounter;
+						}
+					}
+					if (StringUtils.isNotBlank(oid)) {
+						author.setPid(Arrays.asList(mapAuthorId(oid)));
+						author.setFullname(name + " " + surname);
+						if (StringUtils.isNotBlank(name)) {
+							author.setName(name);
+						}
+						if (StringUtils.isNotBlank(surname)) {
+							author.setSurname(surname);
+						}
+					} else {
+						PacePerson p = new PacePerson(creditname, false);
+						if (p.isAccurate()) {
+							author.setName(p.getNormalisedFirstName());
+							author.setSurname(p.getNormalisedSurname());
+							author.setFullname(p.getNormalisedFullname());
+						} else {
+							author.setFullname(creditname);
+						}
+					}
+				}
+				author.setRank(rank);
+				authors.add(author);
+				currentRank = rank;
+			}
+			return authors;
+
+		}
+		return null;
+	}
+
+	private static List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
+		if (!rootElement.has(fieldName)) {
+			return null;
+		}
+		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) {
+			return null;
+		}
+		if (rootElement.get(fieldName).isJsonArray()) {
+			if (!isValidJsonArray(rootElement, fieldName)) {
+				return null;
+			}
+			return getArrayValues(rootElement, fieldName);
+		} else {
+			String field = getStringValue(rootElement, fieldName);
+			return Arrays.asList(cleanField(field));
+		}
+	}
+
+	private static String cleanField(String value) {
+		if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
+			value = value.substring(1, value.length() - 1);
+		}
+		return value;
+	}
+
+	private static void settingRelevantDate(final JsonObject rootElement,
+		final Publication publication,
+		final String jsonKey,
+		final String dictionaryKey,
+		final boolean addToDateOfAcceptance) {
+
+		final String pubDate = getPublicationDate(rootElement, "publication_date");
+		if (StringUtils.isNotBlank(pubDate)) {
+			if (addToDateOfAcceptance) {
+				publication.setDateofacceptance(mapStringField(pubDate, null));
+			}
+			Qualifier q = mapQualifier(dictionaryKey, dictionaryKey, "dnet:dataCite_date", "dnet:dataCite_date");
+			publication
+				.setRelevantdate(
+					Arrays
+						.asList(pubDate)
+						.stream()
+						.map(r -> {
+							return mapStructuredProperty(r, q, null);
+						})
+						.collect(Collectors.toList()));
+		}
+	}
+
+	private static String getPublicationDate(final JsonObject rootElement,
+		final String jsonKey) {
+
+		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
+		if (pubDateJson == null) {
+			return null;
+		}
+		final String year = getStringValue(pubDateJson, "year");
+		final String month = getStringValue(pubDateJson, "month");
+		final String day = getStringValue(pubDateJson, "day");
+
+		if (StringUtils.isBlank(year)) {
+			return null;
+		}
+		String pubDate = "".concat(year);
+		if (StringUtils.isNotBlank(month)) {
+			pubDate = pubDate.concat("-" + month);
+			if (StringUtils.isNotBlank(day)) {
+				pubDate = pubDate.concat("-" + day);
+			} else {
+				pubDate += "-01";
+			}
+		} else {
+			pubDate += "-01-01";
+		}
+		if (isValidDate(pubDate)) {
+			return pubDate;
+		}
+		return null;
+	}
+
+	protected static boolean isValid(final JsonObject rootElement/* , final Reporter context */) {
+
+		final String type = getStringValue(rootElement, "type");
+		if (!typologiesMapping.containsKey(type)) {
+			logger.error("unknowntype_" + type);
+//            context.incrementCounter("filtered", "unknowntype_" + type, 1);
+			return false;
+		}
+
+		if (!isValidJsonArray(rootElement, "titles")) {
+			logger.error("invalid_title");
+//            context.incrementCounter("filtered", "invalid_title", 1);
+			return false;
+		}
+		return true;
+	}
+
+	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
+		if (!rootElement.has(fieldName)) {
+			return false;
+		}
+		final JsonElement jsonElement = rootElement.get(fieldName);
+		if (jsonElement.isJsonNull()) {
+			return false;
+		}
+		if (jsonElement.isJsonArray()) {
+			final JsonArray jsonArray = jsonElement.getAsJsonArray();
+			if (jsonArray.isJsonNull()) {
+				return false;
+			}
+			if (jsonArray.get(0).isJsonNull()) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
+		final Qualifier qualifier = new Qualifier();
+		qualifier.setClassid(classId);
+		qualifier.setClassname(className);
+		qualifier.setSchemeid(schemeId);
+		qualifier.setSchemename(schemeName);
+		return qualifier;
+	}
+
+	private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId,
+		String schemeName) {
+		ExternalReference ex = new ExternalReference();
+		ex.setRefidentifier(extId);
+		ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName));
+		return ex;
+	}
+
+	private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
+		if (value == null | StringUtils.isBlank(value)) {
+			return null;
+		}
+
+		final StructuredProperty structuredProperty = new StructuredProperty();
+		structuredProperty.setValue(value);
+		structuredProperty.setQualifier(qualifier);
+		structuredProperty.setDataInfo(dataInfo);
+		return structuredProperty;
+	}
+
+	private static Field<String> mapStringField(String value, DataInfo dataInfo) {
+		if (value == null || StringUtils.isBlank(value)) {
+			return null;
+		}
+
+		final Field<String> stringField = new Field<>();
+		stringField.setValue(value);
+		stringField.setDataInfo(dataInfo);
+		return stringField;
+	}
+
+	private static KeyValue createCollectedFrom() {
+		KeyValue cf = new KeyValue();
+		cf.setValue(ORCID);
+		cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
+		return cf;
+	}
+
+	private static KeyValue createHostedBy() {
+		KeyValue hb = new KeyValue();
+		hb.setValue("Unknown Repository");
+		hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
+		return hb;
+	}
+
+	private static StructuredProperty mapAuthorId(String orcidId) {
+		final StructuredProperty sp = new StructuredProperty();
+		sp.setValue(orcidId);
+		final Qualifier q = new Qualifier();
+		q.setClassid("ORCID");
+		q.setClassname("ORCID");
+		sp.setQualifier(q);
+		return sp;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
index c460f6299..9b9f3c8b2 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
@@ -1,107 +1,109 @@
-package eu.dnetlib.doiboost.orcidnodoi.util;
 
-import com.google.gson.JsonArray;
-import com.google.gson.JsonObject;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.doiboost.orcidnodoi.util;
 
 import java.text.SimpleDateFormat;
 import java.util.*;
 
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+
 public class DumpToActionsUtility {
 
-    private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
+	private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
 
-    public static String getStringValue(final JsonObject root, final String key) {
-        if (root.has(key) && !root.get(key).isJsonNull())
-            return root.get(key).getAsString();
-        return null;
-    }
+	public static String getStringValue(final JsonObject root, final String key) {
+		if (root.has(key) && !root.get(key).isJsonNull())
+			return root.get(key).getAsString();
+		return null;
+	}
 
-    public static List<String> getArrayValues(final JsonObject root, final String key) {
-        if (root.has(key) && root.get(key).isJsonArray()) {
-            final JsonArray asJsonArray = root.get(key).getAsJsonArray();
-            final List<String> result = new ArrayList<>();
+	public static List<String> getArrayValues(final JsonObject root, final String key) {
+		if (root.has(key) && root.get(key).isJsonArray()) {
+			final JsonArray asJsonArray = root.get(key).getAsJsonArray();
+			final List<String> result = new ArrayList<>();
 
+			asJsonArray.forEach(it -> {
+				if (StringUtils.isNotBlank(it.getAsString())) {
+					result.add(it.getAsString());
+				}
+			});
+			return result;
+		}
+		return new ArrayList<>();
+	}
 
-            asJsonArray.forEach(it -> {
-                if (StringUtils.isNotBlank(it.getAsString())) {
-                    result.add(it.getAsString());
-                }
-            });
-            return result;
-        }
-        return new ArrayList<>();
-    }
-    public static List<JsonObject> getArrayObjects(final JsonObject root, final String key) {
-        if (root.has(key) && root.get(key).isJsonArray()) {
-            final JsonArray asJsonArray = root.get(key).getAsJsonArray();
-            final List<JsonObject> result = new ArrayList<>();
-            asJsonArray.forEach(it -> {
-                if (it.getAsJsonObject() != null)  {
-                    result.add(it.getAsJsonObject());
-                }
-            });
-            return result;
-        }
-        return new ArrayList<>();
-    }
+	public static List<JsonObject> getArrayObjects(final JsonObject root, final String key) {
+		if (root.has(key) && root.get(key).isJsonArray()) {
+			final JsonArray asJsonArray = root.get(key).getAsJsonArray();
+			final List<JsonObject> result = new ArrayList<>();
+			asJsonArray.forEach(it -> {
+				if (it.getAsJsonObject() != null) {
+					result.add(it.getAsJsonObject());
+				}
+			});
+			return result;
+		}
+		return new ArrayList<>();
+	}
 
-    public static boolean isValidDate(final String date) {
-        return date.matches("\\d{4}-\\d{2}-\\d{2}");
-    }
+	public static boolean isValidDate(final String date) {
+		return date.matches("\\d{4}-\\d{2}-\\d{2}");
+	}
 
-    public static String now_ISO8601() { // NOPMD
-        String result;
-        synchronized (ISO8601FORMAT) {
-            result = ISO8601FORMAT.format(new Date());
-        }
-        //convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00
-        //- note the added colon for the Timezone
-        return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2);
-    }
+	public static String now_ISO8601() { // NOPMD
+		String result;
+		synchronized (ISO8601FORMAT) {
+			result = ISO8601FORMAT.format(new Date());
+		}
+		// convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00
+		// - note the added colon for the Timezone
+		return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2);
+	}
 
-    public static String getDefaultResulttype(final String cobjcategory) {
-        switch (cobjcategory) {
-            case "0029":
-                return "software";
-            case "0021":
-            case "0024":
-            case "0025":
-            case "0030":
-                return "dataset";
-            case "0000":
-            case "0010":
-            case "0018":
-            case "0020":
-            case "0022":
-            case "0023":
-            case "0026":
-            case "0027":
-            case "0028":
-            case "0037":
-                return "other";
-            case "0001":
-            case "0002":
-            case "0004":
-            case "0005":
-            case "0006":
-            case "0007":
-            case "0008":
-            case "0009":
-            case "0011":
-            case "0012":
-            case "0013":
-            case "0014":
-            case "0015":
-            case "0016":
-            case "0017":
-            case "0019":
-            case "0031":
-            case "0032":
-                return "publication";
-            default:
-                return "publication";
-        }
-    }
+	public static String getDefaultResulttype(final String cobjcategory) {
+		switch (cobjcategory) {
+			case "0029":
+				return "software";
+			case "0021":
+			case "0024":
+			case "0025":
+			case "0030":
+				return "dataset";
+			case "0000":
+			case "0010":
+			case "0018":
+			case "0020":
+			case "0022":
+			case "0023":
+			case "0026":
+			case "0027":
+			case "0028":
+			case "0037":
+				return "other";
+			case "0001":
+			case "0002":
+			case "0004":
+			case "0005":
+			case "0006":
+			case "0007":
+			case "0008":
+			case "0009":
+			case "0011":
+			case "0012":
+			case "0013":
+			case "0014":
+			case "0015":
+			case "0016":
+			case "0017":
+			case "0019":
+			case "0031":
+			case "0032":
+				return "publication";
+			default:
+				return "publication";
+		}
+	}
 
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java
index 58c09af60..8883d00f5 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java
@@ -1,30 +1,32 @@
+
 package eu.dnetlib.doiboost.orcidnodoi.util;
 
 public class Pair<K, V> {
 
-    private K k;
+	private K k;
 
-    private V v;
+	private V v;
 
-    public Pair(K k, V v) {
-        this.k = k;
-        this.v = v;
-    }
+	public Pair(K k, V v) {
+		this.k = k;
+		this.v = v;
+	}
 
-    public K getKey() {
-        return k;
-    }
+	public K getKey() {
+		return k;
+	}
 
-    public V getValue() {
-        return v;
-    }
+	public V getValue() {
+		return v;
+	}
 
-    @Override
-    public boolean equals(Object obj) {
-        if (obj instanceof Pair<?, ?>) {
-            Pair<?, ?> tmp = (Pair<?, ?>) obj;
-            return k.equals(tmp.getKey()) && v.equals(tmp.getValue());
-        } else return false;
-    }
+	@Override
+	public boolean equals(Object obj) {
+		if (obj instanceof Pair<?, ?>) {
+			Pair<?, ?> tmp = (Pair<?, ?>) obj;
+			return k.equals(tmp.getKey()) && v.equals(tmp.getValue());
+		} else
+			return false;
+	}
 
 }
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
index 75f857ca4..8b50f2d8f 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
@@ -54,7 +54,7 @@ public class OrcidClientTest {
 	}
 
 //	@Test
-	public void testLambdaFileParser() throws Exception {
+	private void testLambdaFileParser() throws Exception {
 		try (BufferedReader br = new BufferedReader(
 			new InputStreamReader(this.getClass().getResourceAsStream("last_modified.csv")))) {
 			String line;
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
new file mode 100644
index 000000000..4d04e1a16
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
@@ -0,0 +1,76 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
+
+public class PublicationToOafTest {
+
+	private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
+
+	@Test
+//	@Ignore
+	public void convertOafPublicationTest() throws Exception {
+		String jsonPublication = IOUtils
+			.toString(
+				PublicationToOafTest.class.getResourceAsStream("publication.json"));
+		JsonElement j = new JsonParser().parse(jsonPublication);
+		logger.info("json publication loaded: " + j.toString());
+		Publication oafPublication = (Publication) PublicationToOaf
+			.generatePublicationActionsFromDump(j.getAsJsonObject());
+		assertNotNull(oafPublication.getId());
+		assertNotNull(oafPublication.getOriginalId());
+		assertEquals(oafPublication.getOriginalId().get(0), "60153327");
+		logger.info("oafPublication.getId(): " + oafPublication.getId());
+		assertEquals(
+			oafPublication.getTitle().get(0).getValue(),
+			"Evaluation of a percutaneous optical fibre glucose sensor (FiberSense) across the glycemic range with rapid glucoseexcursions using the glucose clamp");
+		assertNotNull(oafPublication.getLastupdatetimestamp());
+		assertNotNull(oafPublication.getDateofcollection());
+		assertNotNull(oafPublication.getDateoftransformation());
+		assertTrue(oafPublication.getAuthor().size() == 7);
+		oafPublication.getAuthor().forEach(a -> {
+			assertNotNull(a.getFullname());
+			assertNotNull(a.getRank());
+			logger.info("a.getFullname(): " + a.getFullname());
+			if (a.getName() != null) {
+				logger.info("a.getName(): " + a.getName());
+			}
+			if (a.getSurname() != null) {
+				logger.info("a.getSurname(): " + a.getSurname());
+			}
+			logger.info("a.getRank(): " + a.getRank());
+			if (a.getPid() != null) {
+				logger.info("a.getPid(): " + a.getPid().get(0).getValue());
+			}
+
+		});
+		assertNotNull(oafPublication.getCollectedfrom());
+		if (oafPublication.getSource() != null) {
+			logger.info((oafPublication.getSource().get(0).getValue()));
+		}
+		if (oafPublication.getExternalReference() != null) {
+			oafPublication.getExternalReference().forEach(e -> {
+				assertNotNull(e.getRefidentifier());
+				assertEquals(e.getQualifier().getSchemeid(), "dnet:pid_types");
+			});
+		}
+		assertNotNull(oafPublication.getInstance());
+		oafPublication.getInstance().forEach(i -> {
+			assertNotNull(i.getInstancetype().getClassid());
+			logger.info("i.getInstancetype().getClassid(): " + i.getInstancetype().getClassid());
+			assertNotNull(i.getInstancetype().getClassname());
+			logger.info("i.getInstancetype().getClassname(): " + i.getInstancetype().getClassname());
+		});
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index 6a5faddbd..d426b01f1 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -95,7 +95,8 @@ public class OrcidNoDoiTest {
 	}
 
 	@Test
-	public void authorMatchTest() throws Exception {
+	@Ignore
+	private void authorMatchTest() throws Exception {
 		logger.info("running authorSimpleMatchTest ....");
 		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
 		AuthorData author = new AuthorData();
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json
new file mode 100644
index 000000000..579e12f2e
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/publication.json
@@ -0,0 +1 @@
+{"oid":"0000-0002-4147-3387","id":"60153327","sourceName":"The Chinese University of Hong Kong","type":"conference-paper","titles":["Evaluation of a percutaneous optical fibre glucose sensor (FiberSense) across the glycemic range with rapid glucoseexcursions using the glucose clamp"],"extIds":[{"type":"wosuid","value":"000425015800225","relationShip":"self"},{"type":"other-id","value":"441f521e-ab19-448d-ba32-83157b348ada","relationShip":"self"}],"publicationDates":[],"contributors":[{"sequence":"1","oid":"0000-0002-4147-3387","name":"Elaine","surname":"Chow","creditName":"Elaine Chow"},{"sequence":"2","creditName":"Victor Tsui"},{"sequence":"3","creditName":"Achim Müller"},{"sequence":"4","creditName":"Vincy Lee"},{"sequence":"5","creditName":"Lucia Krivánekova"},{"sequence":"6","creditName":"Roland Krivánek"},{"sequence":"7","creditName":"Juliana CN Chan"}]}
\ No newline at end of file

From ca37d3427bc4bfe05932c9231e11ccdfb98752f2 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Fri, 3 Jul 2020 23:30:31 +0200
Subject: [PATCH 07/34] separate workflow to parse orcid summaries, activities
 and generate dataset with no doi publications; test

---
 .../orcid/OrcidAuthorsDOIsDataGen.java        |   8 +-
 .../doiboost/orcid/OrcidDSManager.java        |  14 +-
 .../doiboost/orcid/OrcidDownloader.java       |   8 +-
 .../orcidnodoi/ActivitiesDumpReader.java      |   6 +-
 .../orcidnodoi/GenOrcidAuthorWork.java        |   3 +-
 .../SparkGenEnrichedOrcidWorks.java           |  18 +-
 .../orcidnodoi/oaf/PublicationToOaf.java      |   9 +-
 .../doiboost/create_orcid_authors_data.json   |   2 +-
 .../create_orcid_authors_dois_data.json       |   2 +-
 .../dhp/doiboost/download_orcid_data.json     |   2 +-
 .../oozie_app/workflow.xml                    | 497 +----------------
 .../dhp/doiboost/orcid/oozie_app/workflow.xml |  44 +-
 .../oozie_app/config-default.xml              |  31 ++
 .../orcid_activities/oozie_app/workflow.xml   | 514 ++++++++++++++++++
 .../oozie_app/config-default.xml              |  22 +
 .../orcid_summaries/oozie_app/workflow.xml    |  68 +++
 .../doiboost/orcid/OrcidClientTest.java       |  29 +-
 .../orcidnodoi/PublicationToOafTest.java      |   5 +-
 .../orcidnodoi/xml/OrcidNoDoiTest.java        |   4 +-
 .../xml/activity_work_0000-0002-2536-4498.xml |  72 +++
 20 files changed, 815 insertions(+), 543 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
index 70528a8f6..2ec4fe59d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
@@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
 	public void generateAuthorsDOIsData() throws Exception {
 		Configuration conf = initConfigurationObject();
 		FileSystem fs = initFileSystemObject(conf);
-		String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
-		Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
+		Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
 		ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
 	}
 
@@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
 
 		hdfsServerUri = parser.get("hdfsServerUri");
 		Log.info("HDFS URI: " + hdfsServerUri);
-		hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-		Log.info("Default Path: " + hdfsOrcidDefaultPath);
+		workingPath = parser.get("workingPath");
+		Log.info("Default Path: " + workingPath);
 		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
 		Log.info("Activities File Name: " + activitiesFileNameTarGz);
 		outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
index 4f846bdf3..aa61c0117 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 public class OrcidDSManager {
 
 	protected String hdfsServerUri;
-	protected String hdfsOrcidDefaultPath;
+	protected String workingPath;
 	private String summariesFileNameTarGz;
 	private String outputAuthorsPath;
 
@@ -28,10 +28,10 @@ public class OrcidDSManager {
 	public void generateAuthors() throws Exception {
 		Configuration conf = initConfigurationObject();
 		FileSystem fs = initFileSystemObject(conf);
-		String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
 		Path outputPath = new Path(
 			hdfsServerUri
-				.concat(hdfsOrcidDefaultPath)
+				.concat(workingPath)
 				.concat(outputAuthorsPath)
 				.concat("authors.seq"));
 		SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
@@ -41,7 +41,7 @@ public class OrcidDSManager {
 		// ====== Init HDFS File System Object
 		Configuration conf = new Configuration();
 		// Set FileSystem URI
-		conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
+		conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
 		// Because of Maven
 		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
 		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
@@ -52,7 +52,7 @@ public class OrcidDSManager {
 		// Get the filesystem - HDFS
 		FileSystem fs = null;
 		try {
-			fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
+			fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
 		} catch (IOException e) {
 			// TODO Auto-generated catch block
 			e.printStackTrace();
@@ -71,8 +71,8 @@ public class OrcidDSManager {
 
 		hdfsServerUri = parser.get("hdfsServerUri");
 		Log.info("HDFS URI: " + hdfsServerUri);
-		hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-		Log.info("Default Path: " + hdfsOrcidDefaultPath);
+		workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + workingPath);
 		summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
 		Log.info("Summaries File Name: " + summariesFileNameTarGz);
 		outputAuthorsPath = parser.get("outputAuthorsPath");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
index 2e1a199da..762d8aecd 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
@@ -69,12 +69,12 @@ public class OrcidDownloader extends OrcidDSManager {
 		long startDownload = 0;
 		Configuration conf = initConfigurationObject();
 		FileSystem fs = initFileSystemObject(conf);
-		String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
+		String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
 		Path hdfsreadpath = new Path(lambdaFileUri);
 		FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
 		Path hdfsoutputPath = new Path(
 			hdfsServerUri
-				.concat(hdfsOrcidDefaultPath)
+				.concat(workingPath)
 				.concat(outputPath)
 				.concat("orcid_records.seq"));
 
@@ -176,8 +176,8 @@ public class OrcidDownloader extends OrcidDSManager {
 
 		hdfsServerUri = parser.get("hdfsServerUri");
 		Log.info("HDFS URI: " + hdfsServerUri);
-		hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-		Log.info("Default Path: " + hdfsOrcidDefaultPath);
+		workingPath = parser.get("workingPath");
+		Log.info("Default Path: " + workingPath);
 		lambdaFileName = parser.get("lambdaFileName");
 		Log.info("Lambda File Name: " + lambdaFileName);
 		outputPath = parser.get("outputPath");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index bf63568d8..807f52972 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
 
 public class ActivitiesDumpReader {
 
-	private static final int MAX_XML_WORKS_PARSED = 100;
-	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10;
+	private static final int MAX_XML_WORKS_PARSED = -1;
+	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
 
 	public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
 		throws Exception {
@@ -127,7 +127,7 @@ public class ActivitiesDumpReader {
 						Log
 							.warn(
 								"Parsing work from tar archive and xml work: " + filename + "  " + e.getMessage());
-						Log.warn(e);
+//						Log.warn(e);
 					}
 
 					if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
index 8dcee796c..041424ba9 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -16,7 +16,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
 
 	private String activitiesFileNameTarGz;
 	private String outputWorksPath;
-	private String workingPath;
+//	private String workingPath;
 
 	public static void main(String[] args) throws IOException, Exception {
 		GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
@@ -45,7 +45,6 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
 		Log.info("HDFS URI: " + hdfsServerUri);
 		workingPath = parser.get("workingPath");
 		Log.info("Working Path: " + workingPath);
-		hdfsOrcidDefaultPath = workingPath;
 		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
 		Log.info("Activities File Name: " + activitiesFileNameTarGz);
 		outputWorksPath = parser.get("outputWorksPath");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index ae1e4dae6..b0b989463 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -16,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -24,9 +25,11 @@ import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.doiboost.orcid.json.JsonHelper;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
 import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
 import scala.Tuple2;
 
@@ -59,7 +62,7 @@ public class SparkGenEnrichedOrcidWorks {
 				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
 				JavaPairRDD<Text, Text> summariesRDD = sc
-					.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
+					.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
 				Dataset<AuthorData> summariesDataset = spark
 					.createDataset(
 						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
@@ -89,8 +92,19 @@ public class SparkGenEnrichedOrcidWorks {
 					.filter(Objects::nonNull)
 					.toJavaRDD();
 				logger.info("Works enriched data created: " + enrichedWorksRDD.count());
-				enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath);
+				enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
 				logger.info("Works enriched data saved");
+				JavaRDD<Tuple2<String, Publication>> oafPublicationRDD = enrichedWorksRDD.map(e -> {
+					JsonElement j = new JsonParser().parse(e._2());
+					return new Tuple2<>(e._1(), (Publication) PublicationToOaf
+						.generatePublicationActionsFromDump(j.getAsJsonObject()));
+				});
+
+				Dataset<Tuple2<String, Publication>> publicationDataset = spark
+					.createDataset(
+						oafPublicationRDD.repartition(1).rdd(),
+						Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class)));
+				publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
 			});
 	}
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index dc03767ec..19bfe0f30 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -172,7 +172,7 @@ public class PublicationToOaf {
 				instance.setUrl(urls);
 			}
 
-			final String pubDate = getPublicationDate(rootElement, "publication_date");
+			final String pubDate = getPublicationDate(rootElement, "publicationDates");
 			if (StringUtils.isNotBlank(pubDate)) {
 				instance.setDateofacceptance(mapStringField(pubDate, null));
 			}
@@ -325,7 +325,12 @@ public class PublicationToOaf {
 	private static String getPublicationDate(final JsonObject rootElement,
 		final String jsonKey) {
 
-		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
+		JsonObject pubDateJson = null;
+		try {
+			pubDateJson = rootElement.getAsJsonObject(jsonKey);
+		} catch (Exception e) {
+			return null;
+		}
 		if (pubDateJson == null) {
 			return null;
 		}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json
index bf992b508..6f213e415 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json
@@ -1,6 +1,6 @@
 [
   {"paramName":"n",   "paramLongName":"hdfsServerUri",	"paramDescription": "the server uri",   "paramRequired": true},
-  {"paramName":"d",   "paramLongName":"hdfsOrcidDefaultPath",	"paramDescription": "the default work path",	"paramRequired": true},
+  {"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the default work path",	"paramRequired": true},
   {"paramName":"f",   "paramLongName":"summariesFileNameTarGz",	"paramDescription": "the name of the summaries orcid file",	"paramRequired": true},
   {"paramName":"o",   "paramLongName":"outputAuthorsPath",	"paramDescription": "the relative folder of the sequencial file to write",	"paramRequired": true}
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json
index 131c30125..b2f0fdeda 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json
@@ -1,6 +1,6 @@
 [
   {"paramName":"n",   "paramLongName":"hdfsServerUri",	"paramDescription": "the server uri",   "paramRequired": true},
-  {"paramName":"d",   "paramLongName":"hdfsOrcidDefaultPath",	"paramDescription": "the default work path",	"paramRequired": true},
+  {"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the default work path",	"paramRequired": true},
   {"paramName":"f",   "paramLongName":"activitiesFileNameTarGz",	"paramDescription": "the name of the activities orcid file",	"paramRequired": true},
   {"paramName":"o",   "paramLongName":"outputAuthorsDOIsPath",	"paramDescription": "the relative folder of the sequencial file to write",	"paramRequired": true}
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json
index 444e487f7..8c69b168b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json
@@ -1,6 +1,6 @@
 [
   {"paramName":"n",   "paramLongName":"hdfsServerUri",	"paramDescription": "the server uri",   "paramRequired": true},
-  {"paramName":"d",   "paramLongName":"hdfsOrcidDefaultPath",	"paramDescription": "the default work path",	"paramRequired": true},
+  {"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the default work path",	"paramRequired": true},
   {"paramName":"f",   "paramLongName":"lambdaFileName",	"paramDescription": "the name of the lambda file",	"paramRequired": true},
   {"paramName":"o",   "paramLongName":"outputPath",	"paramDescription": "the relative folder of the sequencial file to write",	"paramRequired": true},
   {"paramName":"t",   "paramLongName":"token",	"paramDescription": "token to grant access",	"paramRequired": true}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
index df5e0e76f..a60af8b45 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@@ -1,75 +1,9 @@
 <workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
-            <name>workingPath_activities</name>
+            <name>workingPath</name>
             <description>the working dir base path</description>
         </property>
-        <property>
-            <name>shell_cmd_0</name>
-            <value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
-        </property>
-        <property>
-            <name>shell_cmd_1</name>
-            <value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
-        </property>
-        <property>
-            <name>shell_cmd_2</name>
-            <value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
-        </property>
-        <property>
-            <name>shell_cmd_3</name>
-            <value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
-        </property> 
-        <property>
-            <name>shell_cmd_4</name>
-            <value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
-        </property> 
-        <property>
-            <name>shell_cmd_5</name>
-            <value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
-        </property>  
-        <property>
-            <name>shell_cmd_6</name>
-            <value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
-        </property>
-        <property>
-            <name>shell_cmd_7</name>
-            <value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
-        </property>
-        <property>
-            <name>shell_cmd_8</name>
-            <value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
-        </property>
-        <property>
-            <name>shell_cmd_9</name>
-            <value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
-        </property> 
-        <property>
-            <name>shell_cmd_X</name>
-            <value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
-        </property>  
     </parameters>
 
     <start to="ResetWorkingPath"/>
@@ -80,436 +14,11 @@
     
     <action name="ResetWorkingPath">
         <fs>
-            <delete path='${workingPath_activities}/no_doi_works/*'/>
-            <delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
+            <delete path='${workingPath}/no_doi_enriched_works/output'/>
         </fs>
-        <ok to="fork_gen_orcid_author_work"/>
+        <ok to="Gen_Enriched_Orcid_Works"/>
         <error to="Kill"/>
     </action>
-    
-    <fork name = "fork_gen_orcid_author_work">
-      <path start = "check_exist_on_hdfs_activities_0"/>
-      <path start = "check_exist_on_hdfs_activities_1"/>
-      <path start = "check_exist_on_hdfs_activities_2"/>
-	  <path start = "check_exist_on_hdfs_activities_3"/>
-	  <path start = "check_exist_on_hdfs_activities_4"/>
-	  <path start = "check_exist_on_hdfs_activities_5"/>
-	  <path start = "check_exist_on_hdfs_activities_6"/>
-	  <path start = "check_exist_on_hdfs_activities_7"/>
-	  <path start = "check_exist_on_hdfs_activities_8"/>
-	  <path start = "check_exist_on_hdfs_activities_9"/>
-	  <path start = "check_exist_on_hdfs_activities_X"/>
-   	</fork>
-   	
-    <decision name="check_exist_on_hdfs_activities_0">
-         <switch>
-            <case to="GenOrcidAuthorWork_0">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
-            </case>
-            <default to="Download_0" />
-         </switch>
-	</decision>
-	
-    <action name="Download_0">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_0}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_0"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_0">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_1">
-         <switch>
-            <case to="GenOrcidAuthorWork_1">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
-            </case>
-            <default to="Download_1" />
-         </switch>
-	</decision>
-	
-    <action name="Download_1">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_1}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_1"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_1">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_2">
-         <switch>
-            <case to="GenOrcidAuthorWork_2">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
-            </case>
-            <default to="Download_2" />
-         </switch>
-	</decision>
-	
-    <action name="Download_2">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_2}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_2"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_2">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_3">
-         <switch>
-            <case to="GenOrcidAuthorWork_3">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
-            </case>
-            <default to="Download_3" />
-         </switch>
-	</decision>
-	
-    <action name="Download_3">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_3}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_3"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_3">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_4">
-         <switch>
-            <case to="GenOrcidAuthorWork_4">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
-            </case>
-            <default to="Download_4" />
-         </switch>
-	</decision>
-	
-    <action name="Download_4">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_4}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_4"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_4">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_5">
-         <switch>
-            <case to="GenOrcidAuthorWork_5">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
-            </case>
-            <default to="Download_5" />
-         </switch>
-	</decision>
-	
-    <action name="Download_5">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_5}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_5"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_5">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_6">
-         <switch>
-            <case to="GenOrcidAuthorWork_6">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
-            </case>
-            <default to="Download_6" />
-         </switch>
-	</decision>
-	
-    <action name="Download_6">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_6}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_6"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_6">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    
-    <decision name="check_exist_on_hdfs_activities_7">
-         <switch>
-            <case to="GenOrcidAuthorWork_7">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
-            </case>
-            <default to="Download_7" />
-         </switch>
-	</decision>
-	
-    <action name="Download_7">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_7}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_7"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_7">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_8">
-         <switch>
-            <case to="GenOrcidAuthorWork_8">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
-            </case>
-            <default to="Download_8" />
-         </switch>
-	</decision>
-	
-    <action name="Download_8">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_8}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_8"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_8">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_9">
-         <switch>
-            <case to="GenOrcidAuthorWork_9">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
-            </case>
-            <default to="Download_9" />
-         </switch>
-	</decision>
-	
-    <action name="Download_9">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_9}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_9"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_9">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-    
-    <decision name="check_exist_on_hdfs_activities_X">
-         <switch>
-            <case to="GenOrcidAuthorWork_X">
-              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
-            </case>
-            <default to="Download_X" />
-         </switch>
-	</decision>
-	
-    <action name="Download_X">
-		<shell xmlns="uri:oozie:shell-action:0.1">
-		<job-tracker>${jobTracker}</job-tracker>
-		<name-node>${nameNode}</name-node>
-		<exec>bash</exec>
-	    <argument>-c</argument>
-	    <argument>${shell_cmd_X}</argument>
-		<capture-output/>
-		</shell>
-	<ok to="GenOrcidAuthorWork_X"/>
-	<error to="Kill"/>
-	</action>
-	
-	<action name="GenOrcidAuthorWork_X">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
-            <arg>-w</arg><arg>${workingPath_activities}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
-            <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
-        </java>
-        <ok to="join_node"/>
-        <error to="Kill"/>
-    </action>
-
-    <join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
 
     <action name="Gen_Enriched_Orcid_Works">
         <spark xmlns="uri:oozie:spark-action:0.2">
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
index 7a8d04187..51e00dc0f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
@@ -1,9 +1,15 @@
-<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>workingPath</name>
             <description>the working dir base path</description>
         </property>
+        <property>
+            <name>shell_cmd_0</name>
+            <value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid summaries</description>
+        </property>
     </parameters>
     
     <start to="ResetWorkingPath"/>
@@ -15,24 +21,44 @@
     
     <action name="ResetWorkingPath">
         <fs>
-            <delete path='${workingPath}/output'/>
-            <mkdir path='${workingPath}/output'/>
+            <delete path='${workingPath}/summaries/output'/>
+            <mkdir path='${workingPath}/summaries/output'/>
         </fs>
-        <ok to="ImportOrcidSummary"/>
+        <ok to="check_exist_on_hdfs_summaries"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="check_exist_on_hdfs_summaries">
+        <switch>
+            <case to="ImportOrcidSummaries">
+                ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
+            </case>
+            <default to="DownloadSummaries" />
+        </switch>
+    </decision>
+
+    <action name="DownloadSummaries">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_0}</argument>
+            <capture-output/>
+        </shell>
+        <ok to="ImportOrcidSummaries"/>
         <error to="Kill"/>
     </action>
     
-    
-    
-    <action name="ImportOrcidSummary">
+    <action name="ImportOrcidSummaries">
         <java>
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
-            <arg>-d</arg><arg>${workingPath}/</arg>
+            <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
-            <arg>-o</arg><arg>output/</arg>
+            <arg>-o</arg><arg>summaries/output/</arg>
         </java>
         <ok to="End"/>
         <error to="Kill"/>
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
new file mode 100644
index 000000000..3068562d0
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
@@ -0,0 +1,31 @@
+<configuration>
+    <property>
+        <name>oozie.action.sharelib.for.java</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.map.java.opts</name>
+        <value>-Xmx4g</value>
+    </property>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
new file mode 100644
index 000000000..8f9a5123e
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
@@ -0,0 +1,514 @@
+<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>shell_cmd_0</name>
+            <value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
+        </property>
+        <property>
+            <name>shell_cmd_1</name>
+            <value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
+        </property>
+        <property>
+            <name>shell_cmd_2</name>
+            <value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
+        </property>
+        <property>
+            <name>shell_cmd_3</name>
+            <value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
+        </property> 
+        <property>
+            <name>shell_cmd_4</name>
+            <value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
+        </property> 
+        <property>
+            <name>shell_cmd_5</name>
+            <value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
+        </property>  
+        <property>
+            <name>shell_cmd_6</name>
+            <value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
+        </property>
+        <property>
+            <name>shell_cmd_7</name>
+            <value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
+        </property>
+        <property>
+            <name>shell_cmd_8</name>
+            <value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
+        </property>
+        <property>
+            <name>shell_cmd_9</name>
+            <value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
+        </property> 
+        <property>
+            <name>shell_cmd_X</name>
+            <value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
+        </property>  
+    </parameters>
+
+    <start to="ResetWorkingPath"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath}/no_doi_works/*'/>
+        </fs>
+        <ok to="fork_gen_orcid_author_work"/>
+        <error to="Kill"/>
+    </action>
+    
+    <fork name = "fork_gen_orcid_author_work">
+      <path start = "check_exist_on_hdfs_activities_0"/>
+      <path start = "check_exist_on_hdfs_activities_1"/>
+      <path start = "check_exist_on_hdfs_activities_2"/>
+	  <path start = "check_exist_on_hdfs_activities_3"/>
+	  <path start = "check_exist_on_hdfs_activities_4"/>
+	  <path start = "check_exist_on_hdfs_activities_5"/>
+	  <path start = "check_exist_on_hdfs_activities_6"/>
+	  <path start = "check_exist_on_hdfs_activities_7"/>
+	  <path start = "check_exist_on_hdfs_activities_8"/>
+	  <path start = "check_exist_on_hdfs_activities_9"/>
+	  <path start = "check_exist_on_hdfs_activities_X"/>
+   	</fork>
+   	
+    <decision name="check_exist_on_hdfs_activities_0">
+         <switch>
+            <case to="GenOrcidAuthorWork_0">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
+            </case>
+            <default to="Download_0" />
+         </switch>
+	</decision>
+	
+    <action name="Download_0">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_0}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_0"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_0">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_1">
+         <switch>
+            <case to="GenOrcidAuthorWork_1">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
+            </case>
+            <default to="Download_1" />
+         </switch>
+	</decision>
+	
+    <action name="Download_1">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_1}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_1"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_1">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_2">
+         <switch>
+            <case to="GenOrcidAuthorWork_2">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
+            </case>
+            <default to="Download_2" />
+         </switch>
+	</decision>
+	
+    <action name="Download_2">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_2}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_2"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_2">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_3">
+         <switch>
+            <case to="GenOrcidAuthorWork_3">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
+            </case>
+            <default to="Download_3" />
+         </switch>
+	</decision>
+	
+    <action name="Download_3">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_3}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_3"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_3">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_4">
+         <switch>
+            <case to="GenOrcidAuthorWork_4">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
+            </case>
+            <default to="Download_4" />
+         </switch>
+	</decision>
+	
+    <action name="Download_4">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_4}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_4"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_4">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_5">
+         <switch>
+            <case to="GenOrcidAuthorWork_5">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
+            </case>
+            <default to="Download_5" />
+         </switch>
+	</decision>
+	
+    <action name="Download_5">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_5}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_5"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_5">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_6">
+         <switch>
+            <case to="GenOrcidAuthorWork_6">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
+            </case>
+            <default to="Download_6" />
+         </switch>
+	</decision>
+	
+    <action name="Download_6">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_6}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_6"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_6">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    
+    <decision name="check_exist_on_hdfs_activities_7">
+         <switch>
+            <case to="GenOrcidAuthorWork_7">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
+            </case>
+            <default to="Download_7" />
+         </switch>
+	</decision>
+	
+    <action name="Download_7">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_7}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_7"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_7">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_8">
+         <switch>
+            <case to="GenOrcidAuthorWork_8">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
+            </case>
+            <default to="Download_8" />
+         </switch>
+	</decision>
+	
+    <action name="Download_8">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_8}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_8"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_8">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_9">
+         <switch>
+            <case to="GenOrcidAuthorWork_9">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
+            </case>
+            <default to="Download_9" />
+         </switch>
+	</decision>
+	
+    <action name="Download_9">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_9}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_9"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_9">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_X">
+         <switch>
+            <case to="GenOrcidAuthorWork_X">
+              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
+            </case>
+            <default to="Download_X" />
+         </switch>
+	</decision>
+	
+    <action name="Download_X">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_X}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_X"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_X">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name = "join_node" to = "End"/>
+    
+   <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
new file mode 100644
index 000000000..e77dd09c9
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
@@ -0,0 +1,22 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
new file mode 100644
index 000000000..3362cc67b
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
@@ -0,0 +1,68 @@
+<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>shell_cmd_0</name>
+            <value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid summaries</description>
+        </property>
+    </parameters>
+    
+    <start to="ResetWorkingPath"/>
+    
+    
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath}/summaries/output'/>
+            <mkdir path='${workingPath}/summaries/output'/>
+        </fs>
+        <ok to="check_exist_on_hdfs_summaries"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="check_exist_on_hdfs_summaries">
+        <switch>
+            <case to="ImportOrcidSummaries">
+                ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
+            </case>
+            <default to="DownloadSummaries" />
+        </switch>
+    </decision>
+
+    <action name="DownloadSummaries">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_0}</argument>
+            <capture-output/>
+        </shell>
+        <ok to="ImportOrcidSummaries"/>
+        <error to="Kill"/>
+    </action>
+    
+    <action name="ImportOrcidSummaries">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
+            <arg>-o</arg><arg>summaries/output/</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
index 8b50f2d8f..5e0f91ecd 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
@@ -3,9 +3,8 @@ package eu.dnetlib.doiboost.orcid;
 
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.*;
+import java.nio.file.Files;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
@@ -20,6 +19,7 @@ import org.apache.http.impl.client.HttpClients;
 import org.junit.jupiter.api.Test;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 
 public class OrcidClientTest {
 	final String orcidId = "0000-0001-7291-3210";
@@ -32,11 +32,20 @@ public class OrcidClientTest {
 	String lastUpdate = "2019-09-30 00:00:00";
 	String shortDate = "2020-05-06 16:06:11";
 
-//	curl -i -H "Accept: application/vnd.orcid+xml" 
+//	curl -i -H "Accept: application/vnd.orcid+xml"
 //	-H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
 //	'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
 
-	public String testDownloadRecord(String orcidId) throws Exception {
+	@Test
+	public void downloadTest() throws Exception {
+		String record = testDownloadRecord("0000-0002-2536-4498");
+		File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
+		OutputStream outStream = new FileOutputStream(f);
+		IOUtils.write(record.getBytes(), outStream);
+		System.out.println("saved to tmp");
+	}
+
+	private String testDownloadRecord(String orcidId) throws Exception {
 		try (CloseableHttpClient client = HttpClients.createDefault()) {
 			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
 			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
@@ -100,7 +109,7 @@ public class OrcidClientTest {
 	}
 
 //	@Test
-	public void getRecordDatestamp() throws ParseException {
+	private void getRecordDatestamp() throws ParseException {
 		Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
 		Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
 		Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
@@ -108,7 +117,7 @@ public class OrcidClientTest {
 		assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
 	}
 
-	public void testDate(String value) throws ParseException {
+	private void testDate(String value) throws ParseException {
 		System.out.println(value.toString());
 		if (value.length() != 19) {
 			value = value.substring(0, 19);
@@ -118,14 +127,16 @@ public class OrcidClientTest {
 	}
 
 //	@Test
-	public void testModifiedDate() throws ParseException {
+	@Ignore
+	private void testModifiedDate() throws ParseException {
 		testDate(toRetrieveDate);
 		testDate(toNotRetrieveDate);
 		testDate(shortDate);
 	}
 
 //	@Test
-	public void testReadBase64CompressedRecord() throws Exception {
+	@Ignore
+	private void testReadBase64CompressedRecord() throws Exception {
 		final String base64CompressedRecord = IOUtils
 			.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
 		final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
index 4d04e1a16..39f78522f 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
@@ -13,14 +13,15 @@ import com.google.gson.JsonParser;
 
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 
 public class PublicationToOafTest {
 
 	private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
 
 	@Test
-//	@Ignore
-	public void convertOafPublicationTest() throws Exception {
+	@Ignore
+	private void convertOafPublicationTest() throws Exception {
 		String jsonPublication = IOUtils
 			.toString(
 				PublicationToOafTest.class.getResourceAsStream("publication.json"));
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index d426b01f1..ca91a242a 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -42,12 +42,12 @@ public class OrcidNoDoiTest {
 
 	@Test
 	@Ignore
-	private void readPublicationFieldsTest()
+	public void readPublicationFieldsTest()
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
 		logger.info("running loadPublicationFieldsTest ....");
 		String xml = IOUtils
 			.toString(
-				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
+				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml"));
 
 		if (xml == null) {
 			logger.info("Resource not found");
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml
new file mode 100644
index 000000000..43d3b2351
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<work:work xmlns:address="http://www.orcid.org/ns/address"
+           xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
+           xmlns:employment="http://www.orcid.org/ns/employment"
+           xmlns:education="http://www.orcid.org/ns/education"
+           xmlns:other-name="http://www.orcid.org/ns/other-name"
+           xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+           xmlns:funding="http://www.orcid.org/ns/funding"
+           xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+           xmlns:service="http://www.orcid.org/ns/service"
+           xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+           xmlns:distinction="http://www.orcid.org/ns/distinction"
+           xmlns:internal="http://www.orcid.org/ns/internal"
+           xmlns:membership="http://www.orcid.org/ns/membership"
+           xmlns:person="http://www.orcid.org/ns/person"
+           xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+           xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+           xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+           xmlns:activities="http://www.orcid.org/ns/activities"
+           xmlns:qualification="http://www.orcid.org/ns/qualification"
+           xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+           xmlns:error="http://www.orcid.org/ns/error"
+           xmlns:preferences="http://www.orcid.org/ns/preferences"
+           xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+           xmlns:work="http://www.orcid.org/ns/work"
+           xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="63461376"
+           path="/0000-0002-2536-4498/work/63461376" visibility="public">
+    <common:created-date>2019-10-22T03:18:13.755Z</common:created-date>
+    <common:last-modified-date>2020-06-17T11:07:13.703Z</common:last-modified-date>
+    <common:source>
+        <common:source-client-id>
+            <common:uri>https://orcid.org/client/0000-0001-8607-8906</common:uri>
+            <common:path>0000-0001-8607-8906</common:path>
+            <common:host>orcid.org</common:host>
+        </common:source-client-id>
+        <common:source-name>INSPIRE-HEP</common:source-name>
+    </common:source>
+    <work:title>
+        <common:title>Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector</common:title>
+    </work:title>
+    <common:external-ids>
+        <common:external-id>
+            <common:external-id-type>other-id</common:external-id-type>
+            <common:external-id-value>1759875</common:external-id-value>
+            <common:external-id-normalized transient="true">1759875</common:external-id-normalized>
+            <common:external-id-url>http://inspirehep.net/record/1759875</common:external-id-url>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+        <common:external-id>
+            <common:external-id-type>doi</common:external-id-type>
+            <common:external-id-value>10.1140/epjc/s10052-020-7907-9</common:external-id-value>
+            <common:external-id-normalized transient="true">10.1140/epjc/s10052-020-7907-9</common:external-id-normalized>
+            <common:external-id-url>http://dx.doi.org/10.1140/epjc/s10052-020-7907-9</common:external-id-url>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+        <common:external-id>
+            <common:external-id-type>arxiv</common:external-id-type>
+            <common:external-id-value>1910.08819</common:external-id-value>
+            <common:external-id-normalized transient="true">arXiv:1910.08819</common:external-id-normalized>
+            <common:external-id-url>http://arxiv.org/abs/1910.08819</common:external-id-url>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+    </common:external-ids>
+    <common:url>http://inspirehep.net/record/1759875</common:url>
+    <work:type>journal-article</work:type>
+    <common:publication-date>
+        <common:year>2020</common:year>
+        <common:month>06</common:month>
+        <common:day>12</common:day>
+    </common:publication-date>
+    <work:journal-title>Eur.Phys.J.C</work:journal-title>
+</work:work>

From c82b15b5f4817348d446fbf53f8337a5e3601085 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Tue, 28 Jul 2020 15:23:52 +0200
Subject: [PATCH 08/34] migrate configuration to ocean, fix publication dataset
 creation

---
 .../SparkGenEnrichedOrcidWorks.java           | 13 ++--
 .../oozie_app/config-default.xml              | 31 ---------
 .../oozie_app/workflow.xml                    | 68 ++++++++++++++++---
 .../orcid/xml/XMLRecordParserTest.java        |  6 +-
 4 files changed, 68 insertions(+), 50 deletions(-)
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index b0b989463..b24e71615 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -91,19 +91,18 @@ public class SparkGenEnrichedOrcidWorks {
 						Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
 					.filter(Objects::nonNull)
 					.toJavaRDD();
-				logger.info("Works enriched data created: " + enrichedWorksRDD.count());
 				enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
 				logger.info("Works enriched data saved");
-				JavaRDD<Tuple2<String, Publication>> oafPublicationRDD = enrichedWorksRDD.map(e -> {
+				JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD.map(e -> {
 					JsonElement j = new JsonParser().parse(e._2());
-					return new Tuple2<>(e._1(), (Publication) PublicationToOaf
-						.generatePublicationActionsFromDump(j.getAsJsonObject()));
-				});
+					return (Publication) PublicationToOaf
+						.generatePublicationActionsFromDump(j.getAsJsonObject());
+				}).filter(p -> p != null);
 
-				Dataset<Tuple2<String, Publication>> publicationDataset = spark
+				Dataset<Publication> publicationDataset = spark
 					.createDataset(
 						oafPublicationRDD.repartition(1).rdd(),
-						Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class)));
+						Encoders.bean(Publication.class));
 				publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
 			});
 	}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
deleted file mode 100644
index 3068562d0..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
+++ /dev/null
@@ -1,31 +0,0 @@
-<configuration>
-    <property>
-        <name>oozie.action.sharelib.for.java</name>
-        <value>spark2</value>
-    </property>
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.launcher.mapreduce.map.java.opts</name>
-        <value>-Xmx4g</value>
-    </property>
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
index a60af8b45..faed3104a 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@@ -1,11 +1,56 @@
-<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
     <parameters>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
         <property>
             <name>workingPath</name>
             <description>the working dir base path</description>
         </property>
     </parameters>
 
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
     <start to="ResetWorkingPath"/>
 
     <kill name="Kill">
@@ -16,20 +61,25 @@
         <fs>
             <delete path='${workingPath}/no_doi_enriched_works/output'/>
         </fs>
-        <ok to="Gen_Enriched_Orcid_Works"/>
+        <ok to="GenOrcidNoDoiDataset"/>
         <error to="Kill"/>
     </action>
 
-    <action name="Gen_Enriched_Orcid_Works">
+    <action name="GenOrcidNoDoiDataset">
         <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn</master>
+            <master>yarn-cluster</master>
             <mode>cluster</mode>
-            <name>Gen_Enriched_Orcid_Works</name>
+            <name>GenOrcidNoDoiDataset</name>
             <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
-            <jar>dhp-doiboost-1.2.4-SNAPSHOT.jar</jar>
-            <spark-opts>--num-executors 10 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             </spark-opts>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
index 4d8237f77..5bf6f27b9 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
@@ -12,7 +12,7 @@ import eu.dnetlib.doiboost.orcid.model.WorkData;
 public class XMLRecordParserTest {
 
 	@Test
-	public void testOrcidAuthorDataXMLParser() throws Exception {
+	private void testOrcidAuthorDataXMLParser() throws Exception {
 
 		String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
 
@@ -27,7 +27,7 @@ public class XMLRecordParserTest {
 	}
 
 	@Test
-	public void testOrcidXMLErrorRecordParser() throws Exception {
+	private void testOrcidXMLErrorRecordParser() throws Exception {
 
 		String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
 
@@ -40,7 +40,7 @@ public class XMLRecordParserTest {
 	}
 
 	@Test
-	public void testOrcidWorkDataXMLParser() throws Exception {
+	private void testOrcidWorkDataXMLParser() throws Exception {
 
 		String xml = IOUtils
 			.toString(

From 196f36c6edd10203ff304e7cd122b3b679593618 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 30 Jul 2020 13:38:33 +0200
Subject: [PATCH 09/34] fix publication dataset creation

---
 .../SparkGenEnrichedOrcidWorks.java           |  47 +++++--
 .../orcidnodoi/oaf/PublicationToOaf.java      | 117 +++++++++++++-----
 .../orcidnodoi/PublicationToOafTest.java      |   3 +-
 3 files changed, 128 insertions(+), 39 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index b24e71615..cae5a168f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -17,10 +17,12 @@ import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
+import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
 import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 
@@ -93,17 +95,48 @@ public class SparkGenEnrichedOrcidWorks {
 					.toJavaRDD();
 				enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
 				logger.info("Works enriched data saved");
-				JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD.map(e -> {
-					JsonElement j = new JsonParser().parse(e._2());
-					return (Publication) PublicationToOaf
-						.generatePublicationActionsFromDump(j.getAsJsonObject());
-				}).filter(p -> p != null);
+
+				final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
+				final LongAccumulator enrichedPublications = spark
+					.sparkContext()
+					.longAccumulator("enrichedPublications");
+				final LongAccumulator errorsGeneric = spark.sparkContext().longAccumulator("errorsGeneric");
+				final LongAccumulator errorsInvalidTitle = spark.sparkContext().longAccumulator("errorsInvalidTitle");
+				final LongAccumulator errorsNotFoundAuthors = spark
+					.sparkContext()
+					.longAccumulator("errorsNotFoundAuthors");
+				final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
+				final PublicationToOaf publicationToOaf = new PublicationToOaf(
+					parsedPublications,
+					enrichedPublications,
+					errorsGeneric,
+					errorsInvalidTitle,
+					errorsNotFoundAuthors,
+					errorsInvalidType);
+				JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
+					.map(
+						e -> {
+							return (Publication) publicationToOaf
+								.generatePublicationActionsFromJson(e._2());
+						})
+					.filter(p -> p != null);
 
 				Dataset<Publication> publicationDataset = spark
 					.createDataset(
-						oafPublicationRDD.repartition(1).rdd(),
+						oafPublicationRDD.rdd(),
 						Encoders.bean(Publication.class));
-				publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
+				publicationDataset
+					.write()
+					.format("parquet")
+					.mode(SaveMode.Overwrite)
+					.save(workingPath + "no_doi_dataset/output");
+
+				logger.info("parsedPublications: " + parsedPublications.value().toString());
+				logger.info("enrichedPublications: " + enrichedPublications.value().toString());
+				logger.info("errorsGeneric: " + errorsGeneric.value().toString());
+				logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
+				logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
+				logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
 			});
 	}
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index 19bfe0f30..448fa9a74 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -3,18 +3,17 @@ package eu.dnetlib.doiboost.orcidnodoi.oaf;
 
 import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*;
 
+import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.gson.Gson;
-import com.google.gson.JsonArray;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonObject;
+import com.google.gson.*;
 
 import eu.dnetlib.dhp.common.PacePerson;
 import eu.dnetlib.dhp.schema.oaf.*;
@@ -22,7 +21,7 @@ import eu.dnetlib.dhp.utils.DHPUtils;
 import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
 import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
 
-public class PublicationToOaf {
+public class PublicationToOaf implements Serializable {
 
 	static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
 
@@ -31,6 +30,37 @@ public class PublicationToOaf {
 	public static final String OPENAIRE_PREFIX = "openaire____";
 	public static final String SEPARATOR = "::";
 
+	private final LongAccumulator parsedPublications;
+	private final LongAccumulator enrichedPublications;
+	private final LongAccumulator errorsGeneric;
+	private final LongAccumulator errorsInvalidTitle;
+	private final LongAccumulator errorsNotFoundAuthors;
+	private final LongAccumulator errorsInvalidType;
+
+	public PublicationToOaf(
+		LongAccumulator parsedPublications,
+		LongAccumulator enrichedPublications,
+		LongAccumulator errorsGeneric,
+		LongAccumulator errorsInvalidTitle,
+		LongAccumulator errorsNotFoundAuthors,
+		LongAccumulator errorsInvalidType) {
+		this.parsedPublications = parsedPublications;
+		this.enrichedPublications = enrichedPublications;
+		this.errorsGeneric = errorsGeneric;
+		this.errorsInvalidTitle = errorsInvalidTitle;
+		this.errorsNotFoundAuthors = errorsNotFoundAuthors;
+		this.errorsInvalidType = errorsInvalidType;
+	}
+
+	public PublicationToOaf() {
+		this.parsedPublications = null;
+		this.enrichedPublications = null;
+		this.errorsGeneric = null;
+		this.errorsInvalidTitle = null;
+		this.errorsNotFoundAuthors = null;
+		this.errorsInvalidType = null;
+	}
+
 	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
 
 		{
@@ -69,11 +99,27 @@ public class PublicationToOaf {
 
 	public static final String PID_TYPES = "dnet:pid_types";
 
-	public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
+	public Oaf generatePublicationActionsFromJson(final String json) {
+		try {
+			if (parsedPublications != null) {
+				parsedPublications.add(1);
+			}
+			JsonElement jElement = new JsonParser().parse(json);
+			JsonObject jObject = jElement.getAsJsonObject();
+			return generatePublicationActionsFromDump(jObject);
+		} catch (Throwable t) {
+			logger.error("creating publication: " + t.getMessage());
+			if (errorsGeneric != null) {
+				errorsGeneric.add(1);
+			}
+			return null;
+		}
+	}
+
+	public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
 
 		logger.debug("generatePublicationActionsFromDump ...");
-		if (!isValid(rootElement/* , context */)) {
-			logger.error("publication not valid");
+		if (!isValid(rootElement)) {
 			return null;
 		}
 
@@ -122,8 +168,9 @@ public class PublicationToOaf {
 		// Adding titles
 		final List<String> titles = createRepeatedField(rootElement, "titles");
 		if (titles == null || titles.isEmpty()) {
-			logger.error("titles not found");
-//            context.incrementCounter("filtered", "title_not_found", 1);
+			if (errorsInvalidTitle != null) {
+				errorsInvalidTitle.add(1);
+			}
 			return null;
 		}
 		Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
@@ -189,8 +236,9 @@ public class PublicationToOaf {
 
 			publication.setInstance(Arrays.asList(instance));
 		} else {
-			logger.error("type not found");
-//            context.incrementCounter("filtered", "type_not_found", 1);
+			if (errorsInvalidType != null) {
+				errorsInvalidType.add(1);
+			}
 			return null;
 		}
 
@@ -199,17 +247,21 @@ public class PublicationToOaf {
 		if (authors != null && authors.size() > 0) {
 			publication.setAuthor(authors);
 		} else {
-			logger.error("authors not found");
-//            context.incrementCounter("filtered", "author_not_found", 1);
+			if (errorsNotFoundAuthors != null) {
+				errorsNotFoundAuthors.add(1);
+			}
 			return null;
 		}
 		String classValue = getDefaultResulttype(cobjValue);
 		publication
 			.setResulttype(mapQualifier(classValue, classValue, "dnet:result_typologies", "dnet:result_typologies"));
+		if (enrichedPublications != null) {
+			enrichedPublications.add(1);
+		}
 		return publication;
 	}
 
-	public static List<Author> createAuthors(final JsonObject root) {
+	public List<Author> createAuthors(final JsonObject root) {
 
 		final String authorsJSONFieldName = "contributors";
 
@@ -273,7 +325,7 @@ public class PublicationToOaf {
 		return null;
 	}
 
-	private static List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
+	private List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
 		if (!rootElement.has(fieldName)) {
 			return null;
 		}
@@ -291,14 +343,14 @@ public class PublicationToOaf {
 		}
 	}
 
-	private static String cleanField(String value) {
+	private String cleanField(String value) {
 		if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
 			value = value.substring(1, value.length() - 1);
 		}
 		return value;
 	}
 
-	private static void settingRelevantDate(final JsonObject rootElement,
+	private void settingRelevantDate(final JsonObject rootElement,
 		final Publication publication,
 		final String jsonKey,
 		final String dictionaryKey,
@@ -322,7 +374,7 @@ public class PublicationToOaf {
 		}
 	}
 
-	private static String getPublicationDate(final JsonObject rootElement,
+	private String getPublicationDate(final JsonObject rootElement,
 		final String jsonKey) {
 
 		JsonObject pubDateJson = null;
@@ -358,24 +410,27 @@ public class PublicationToOaf {
 		return null;
 	}
 
-	protected static boolean isValid(final JsonObject rootElement/* , final Reporter context */) {
+	protected boolean isValid(final JsonObject rootElement/* , final Reporter context */) {
 
 		final String type = getStringValue(rootElement, "type");
 		if (!typologiesMapping.containsKey(type)) {
 			logger.error("unknowntype_" + type);
-//            context.incrementCounter("filtered", "unknowntype_" + type, 1);
+			if (errorsInvalidType != null) {
+				errorsInvalidType.add(1);
+			}
 			return false;
 		}
 
 		if (!isValidJsonArray(rootElement, "titles")) {
-			logger.error("invalid_title");
-//            context.incrementCounter("filtered", "invalid_title", 1);
+			if (errorsInvalidTitle != null) {
+				errorsInvalidTitle.add(1);
+			}
 			return false;
 		}
 		return true;
 	}
 
-	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
+	private boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
 		if (!rootElement.has(fieldName)) {
 			return false;
 		}
@@ -395,7 +450,7 @@ public class PublicationToOaf {
 		return true;
 	}
 
-	private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
+	private Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
 		final Qualifier qualifier = new Qualifier();
 		qualifier.setClassid(classId);
 		qualifier.setClassname(className);
@@ -404,7 +459,7 @@ public class PublicationToOaf {
 		return qualifier;
 	}
 
-	private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId,
+	private ExternalReference convertExtRef(String extId, String classId, String className, String schemeId,
 		String schemeName) {
 		ExternalReference ex = new ExternalReference();
 		ex.setRefidentifier(extId);
@@ -412,7 +467,7 @@ public class PublicationToOaf {
 		return ex;
 	}
 
-	private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
+	private StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
 		if (value == null | StringUtils.isBlank(value)) {
 			return null;
 		}
@@ -424,7 +479,7 @@ public class PublicationToOaf {
 		return structuredProperty;
 	}
 
-	private static Field<String> mapStringField(String value, DataInfo dataInfo) {
+	private Field<String> mapStringField(String value, DataInfo dataInfo) {
 		if (value == null || StringUtils.isBlank(value)) {
 			return null;
 		}
@@ -435,21 +490,21 @@ public class PublicationToOaf {
 		return stringField;
 	}
 
-	private static KeyValue createCollectedFrom() {
+	private KeyValue createCollectedFrom() {
 		KeyValue cf = new KeyValue();
 		cf.setValue(ORCID);
 		cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
 		return cf;
 	}
 
-	private static KeyValue createHostedBy() {
+	private KeyValue createHostedBy() {
 		KeyValue hb = new KeyValue();
 		hb.setValue("Unknown Repository");
 		hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
 		return hb;
 	}
 
-	private static StructuredProperty mapAuthorId(String orcidId) {
+	private StructuredProperty mapAuthorId(String orcidId) {
 		final StructuredProperty sp = new StructuredProperty();
 		sp.setValue(orcidId);
 		final Qualifier q = new Qualifier();
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
index 39f78522f..01e26dcb4 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java
@@ -27,7 +27,8 @@ public class PublicationToOafTest {
 				PublicationToOafTest.class.getResourceAsStream("publication.json"));
 		JsonElement j = new JsonParser().parse(jsonPublication);
 		logger.info("json publication loaded: " + j.toString());
-		Publication oafPublication = (Publication) PublicationToOaf
+		PublicationToOaf publicationToOaf = new PublicationToOaf();
+		Publication oafPublication = (Publication) publicationToOaf
 			.generatePublicationActionsFromDump(j.getAsJsonObject());
 		assertNotNull(oafPublication.getId());
 		assertNotNull(oafPublication.getOriginalId());

From 0377b40fbad56c0dd75fb7c8287488a4f63ceffe Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 30 Jul 2020 18:38:07 +0200
Subject: [PATCH 10/34] output to one parquet file

---
 .../doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java  | 2 +-
 .../doiboost/orcidnodoi/oaf/PublicationToOaf.java        | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index cae5a168f..dea597cbb 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -123,7 +123,7 @@ public class SparkGenEnrichedOrcidWorks {
 
 				Dataset<Publication> publicationDataset = spark
 					.createDataset(
-						oafPublicationRDD.rdd(),
+						oafPublicationRDD.repartition(1).rdd(),
 						Encoders.bean(Publication.class));
 				publicationDataset
 					.write()
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index 448fa9a74..503df67ff 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory;
 import com.google.gson.*;
 
 import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.utils.DHPUtils;
 import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
@@ -217,6 +218,8 @@ public class PublicationToOaf implements Serializable {
 			final List<String> urls = createRepeatedField(rootElement, "urls");
 			if (urls != null && !urls.isEmpty()) {
 				instance.setUrl(urls);
+			} else {
+				dataInfo.setInvisible(true);
 			}
 
 			final String pubDate = getPublicationDate(rootElement, "publicationDates");
@@ -508,8 +511,10 @@ public class PublicationToOaf implements Serializable {
 		final StructuredProperty sp = new StructuredProperty();
 		sp.setValue(orcidId);
 		final Qualifier q = new Qualifier();
-		q.setClassid("ORCID");
-		q.setClassname("ORCID");
+		q.setClassid(ORCID.toLowerCase());
+		q.setClassname(ORCID.toLowerCase());
+		q.setSchemeid(ModelConstants.DNET_PID_TYPES);
+		q.setSchemename(ModelConstants.DNET_PID_TYPES);
 		sp.setQualifier(q);
 		return sp;
 	}

From 538f299767d433ba17681ab82f4b7a32bfb24a2c Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Mon, 14 Sep 2020 12:35:16 +0200
Subject: [PATCH 11/34] merged

---
 .../dhp/broker/oa/IndexNotificationsJob.java    | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java
index cb7acb46d..792a2354a 100644
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java
@@ -47,8 +47,9 @@ public class IndexNotificationsJob {
 
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
-				.toString(IndexNotificationsJob.class
-					.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json")));
+				.toString(
+					IndexNotificationsJob.class
+						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json")));
 		parser.parseArgument(args);
 
 		final SparkConf conf = new SparkConf();
@@ -116,7 +117,8 @@ public class IndexNotificationsJob {
 		final long date) {
 		final List<Notification> list = subscriptions
 			.stream()
-			.filter(s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic()))
+			.filter(
+				s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic()))
 			.filter(s -> verifyConditions(e.getMap(), s.conditionsAsMap()))
 			.map(s -> generateNotification(s, e, date))
 			.collect(Collectors.toList());
@@ -147,15 +149,18 @@ public class IndexNotificationsJob {
 
 		if (conditions.containsKey("trust")
 			&& !SubscriptionUtils
-				.verifyFloatRange(map.getTrust(), conditions.get("trust").get(0).getValue(), conditions.get("trust").get(0).getOtherValue())) {
+				.verifyFloatRange(
+					map.getTrust(), conditions.get("trust").get(0).getValue(),
+					conditions.get("trust").get(0).getOtherValue())) {
 			return false;
 		}
 
 		if (conditions.containsKey("targetDateofacceptance") && !conditions
 			.get("targetDateofacceptance")
 			.stream()
-			.anyMatch(c -> SubscriptionUtils
-				.verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) {
+			.anyMatch(
+				c -> SubscriptionUtils
+					.verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) {
 			return false;
 		}
 

From 9e8e7fe6ef24dbf6a004190cf86cbc623c8b8d21 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Tue, 15 Sep 2020 11:32:49 +0200
Subject: [PATCH 12/34] add comments

---
 .../java/eu/dnetlib/doiboost/orcid/model/AuthorData.java  | 4 ++++
 .../dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java | 6 +++++-
 .../dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java   | 6 +++++-
 .../doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java   | 5 ++++-
 .../eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java   | 4 ++++
 .../eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java | 4 ++++
 .../eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java  | 4 ++++
 .../doiboost/orcidnodoi/model/PublicationDate.java        | 4 ++++
 .../dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java  | 4 ++++
 .../dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 5 ++++-
 .../doiboost/orcidnodoi/similarity/AuthorMatcher.java     | 8 ++++++++
 .../doiboost/orcidnodoi/util/DumpToActionsUtility.java    | 4 ++++
 .../doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java     | 4 ++++
 13 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
index 87f1f65c8..e0624509b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
@@ -3,6 +3,10 @@ package eu.dnetlib.doiboost.orcid.model;
 
 import java.io.Serializable;
 
+/**
+ * This class models the data that are retrieved from orcid publication
+ */
+
 public class AuthorData implements Serializable {
 
 	private String oid;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index 807f52972..d852a7023 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -20,10 +20,14 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
 import eu.dnetlib.doiboost.orcid.json.JsonHelper;
-import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
 
+/**
+ * This class write on hdfs one sequence file, the key is an orcid identifier and the
+ * value is an orcid publication in json format
+ */
+
 public class ActivitiesDumpReader {
 
 	private static final int MAX_XML_WORKS_PARSED = -1;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
index 041424ba9..d32e6d945 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -12,11 +12,15 @@ import org.mortbay.log.Log;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.doiboost.orcid.OrcidDSManager;
 
+/**
+ * This job generates one sequence file, the key is an orcid identifier and the
+ * value is an orcid publication in json format
+ */
+
 public class GenOrcidAuthorWork extends OrcidDSManager {
 
 	private String activitiesFileNameTarGz;
 	private String outputWorksPath;
-//	private String workingPath;
 
 	public static void main(String[] args) throws IOException, Exception {
 		GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index dea597cbb..b984ee2b2 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -22,7 +22,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
 import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 
@@ -35,6 +34,10 @@ import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
 import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
 import scala.Tuple2;
 
+/**
+ * This spark job generates one parquet file, containing orcid publications dataset
+ */
+
 public class SparkGenEnrichedOrcidWorks {
 
 	static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
index 7f7e3a10a..363cb13e6 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
@@ -6,6 +6,10 @@ import com.google.gson.JsonObject;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 
+/**
+ * This class converts an object to json and viceversa
+ */
+
 public class JsonWriter {
 
 	public static String create(AuthorData authorData) {
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
index 8a170de09..9a8651c85 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
@@ -5,6 +5,10 @@ import java.io.Serializable;
 
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 
+/**
+ * This class models the data related to a contributor, that are retrieved from an orcid publication
+ */
+
 public class Contributor extends AuthorData implements Serializable {
 	private String sequence;
 	private String role;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
index 865e54ae3..7fe50ce25 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java
@@ -1,6 +1,10 @@
 
 package eu.dnetlib.doiboost.orcidnodoi.model;
 
+/**
+ * This class models the data related to external id, that are retrieved from an orcid publication
+ */
+
 public class ExternalId {
 	private String type;
 	private String value;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
index 9282a80ba..5f794d8eb 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java
@@ -1,6 +1,10 @@
 
 package eu.dnetlib.doiboost.orcidnodoi.model;
 
+/**
+ * This class models the data related to a publication date, that are retrieved from an orcid publication
+ */
+
 public class PublicationDate {
 	private String year;
 	private String month;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
index 5756521e7..58f992d12 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
@@ -4,6 +4,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model;
 import java.io.Serializable;
 import java.util.List;
 
+/**
+ * This class models the data that are retrieved from orcid publication
+ */
+
 public class WorkDataNoDoi implements Serializable {
 
 	private String oid;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index 503df67ff..4d1408470 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -22,6 +22,10 @@ import eu.dnetlib.dhp.utils.DHPUtils;
 import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
 import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
 
+/**
+ * This class converts an orcid publication from json format to oaf
+ */
+
 public class PublicationToOaf implements Serializable {
 
 	static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
@@ -119,7 +123,6 @@ public class PublicationToOaf implements Serializable {
 
 	public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
 
-		logger.debug("generatePublicationActionsFromDump ...");
 		if (!isValid(rootElement)) {
 			return null;
 		}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
index 1e4c38bef..88c84ee89 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -21,6 +21,14 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 
+/**
+ * This class is used for searching from a list of publication contributors a
+ * specific author making a similarity check on both name and surname of the
+ * author with the credit name of each contributor of the list; as soon as
+ * the match is found (if exist) author informations are used to enrich the
+ * matched contribuotr inside contributors list
+ */
+
 public class AuthorMatcher {
 
 	private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
index 9b9f3c8b2..ea4e58c44 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
@@ -9,6 +9,10 @@ import org.apache.commons.lang3.StringUtils;
 import com.google.gson.JsonArray;
 import com.google.gson.JsonObject;
 
+/**
+ * Utility class
+ */
+
 public class DumpToActionsUtility {
 
 	private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
index ae96a322f..c5c115551 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
@@ -17,6 +17,10 @@ import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
 import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 
+/**
+ * This class is used for parsing xml data with vtd parser
+ */
+
 public class XMLRecordParserNoDoi {
 
 	private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);

From fefbcfb10682728a499ad6181e83519f66b708a7 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Tue, 22 Sep 2020 10:20:25 +0200
Subject: [PATCH 13/34] dependency version moved to main pom (PR review)

---
 dhp-workflows/dhp-doiboost/pom.xml | 2 +-
 pom.xml                            | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml
index e9768be7e..357a57367 100644
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@@ -87,7 +87,7 @@
         <dependency>
             <groupId>org.apache.commons</groupId>
             <artifactId>commons-text</artifactId>
-            <version>1.8</version>
+            <version>${common.text.version}</version>
         </dependency>
 
 
diff --git a/pom.xml b/pom.xml
index e88e1d51b..9897c8abe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -669,5 +669,6 @@
 		<common.compress.version>1.1</common.compress.version>
 		<json4s.version>3.5.3</json4s.version>
 		<jsonschemagenerator.version>4.13.0</jsonschemagenerator.version>
+		<common.text.version>1.8</common.text.version>
 	</properties>
 </project>

From a97ad20c7bd7725ee513694d9b00aae0a19b19d2 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Tue, 22 Sep 2020 10:46:34 +0200
Subject: [PATCH 14/34] exception is now propagated (PR review)

---
 .../java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java   | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
index aa61c0117..8ebeab2e5 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@@ -48,15 +48,10 @@ public class OrcidDSManager {
 		return conf;
 	}
 
-	protected FileSystem initFileSystemObject(Configuration conf) {
+	protected FileSystem initFileSystemObject(Configuration conf) throws IOException {
 		// Get the filesystem - HDFS
 		FileSystem fs = null;
-		try {
-			fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
-		} catch (IOException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		}
+		fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
 		return fs;
 	}
 

From ab083f9946a219396b0099f29d67d7c492eec126 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 14:02:32 +0200
Subject: [PATCH 15/34] propagate exception on parsing work (PR request)

---
 .../dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java  | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index d852a7023..c73e1efd1 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -128,10 +128,7 @@ public class ActivitiesDumpReader {
 							}
 						}
 					} catch (Exception e) {
-						Log
-							.warn(
-								"Parsing work from tar archive and xml work: " + filename + "  " + e.getMessage());
-//						Log.warn(e);
+						throw new Exception(filename, e);
 					}
 
 					if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
@@ -143,7 +140,7 @@ public class ActivitiesDumpReader {
 					}
 				}
 			}
-		} catch (IOException e) {
+		} catch (Exception e) {
 			Log.warn("Parsing work from gzip archive: " + e.getMessage());
 			Log.warn(e);
 			throw new RuntimeException(e);

From c295c71ca0c77f5b3aed5817a872d9c9da77aade Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 14:07:26 +0200
Subject: [PATCH 16/34] added comment

---
 .../src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
index 8ebeab2e5..b62ad370e 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@@ -50,6 +50,7 @@ public class OrcidDSManager {
 
 	protected FileSystem initFileSystemObject(Configuration conf) throws IOException {
 		// Get the filesystem - HDFS
+		// if there is an exception, it will be propagate
 		FileSystem fs = null;
 		fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
 		return fs;

From c3114ba0aeaada891ff13de4c3e4f6469b40ba99 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 14:21:31 +0200
Subject: [PATCH 17/34] replaced null as return value with a more safe empty
 string

---
 .../dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index b984ee2b2..24f0f7a87 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -167,6 +167,6 @@ public class SparkGenEnrichedOrcidWorks {
 				return name.getAsString();
 			}
 		}
-		return null;
+		return new String("");
 	}
 }

From 846ba3087310024cf3e28fb2c88f10259323f5f6 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 14:36:18 +0200
Subject: [PATCH 18/34] if typologies mapping fails, an exception will be
 propagated

---
 .../eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index 4d1408470..deb83723b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -97,8 +97,8 @@ public class PublicationToOaf implements Serializable {
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json"));
 			typologiesMapping = new Gson().fromJson(tt, Map.class);
-		} catch (final Exception e) {
-			logger.error("loading typologies", e);
+		} catch (Exception e) {
+			throw new RuntimeException("loading typologies", e);
 		}
 	}
 

From c58db1c8eab65e0e4ad7d27b7bbc5f815961f050 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 15:11:02 +0200
Subject: [PATCH 19/34] added filter on null value after map function

---
 .../eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index deb83723b..63979d1af 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -185,6 +185,7 @@ public class PublicationToOaf implements Serializable {
 					.map(t -> {
 						return mapStructuredProperty(t, q, null);
 					})
+						.filter(s -> s!=null)
 					.collect(Collectors.toList()));
 		// Adding identifier
 		final String id = getStringValue(rootElement, "id");
@@ -376,7 +377,7 @@ public class PublicationToOaf implements Serializable {
 						.map(r -> {
 							return mapStructuredProperty(r, q, null);
 						})
-						.collect(Collectors.toList()));
+						.filter(s -> s!=null).collect(Collectors.toList()));
 		}
 	}
 

From 1139d6568d392b61e97c12ce1ceac9b2a59b42e2 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 15:32:26 +0200
Subject: [PATCH 20/34] replaced null value with a more safe empty string as
 return value

---
 .../dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 8 +++++++-
 .../doiboost/orcidnodoi/util/DumpToActionsUtility.java    | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index 63979d1af..136356161 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -166,7 +166,13 @@ public class PublicationToOaf implements Serializable {
 		// Adding source
 		final String source = getStringValue(rootElement, "sourceName");
 		if (StringUtils.isNotBlank(source)) {
-			publication.setSource(Arrays.asList(mapStringField(source, null)));
+			Field<String> sourceField = mapStringField(source, null);
+			if (sourceField==null) {
+				publication.setSource(null);
+			}
+			else {
+				publication.setSource(Arrays.asList(sourceField));
+			}
 		}
 
 		// Adding titles
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
index ea4e58c44..8096c4e8e 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java
@@ -20,7 +20,7 @@ public class DumpToActionsUtility {
 	public static String getStringValue(final JsonObject root, final String key) {
 		if (root.has(key) && !root.get(key).isJsonNull())
 			return root.get(key).getAsString();
-		return null;
+		return new String("");
 	}
 
 	public static List<String> getArrayValues(final JsonObject root, final String key) {

From a38ab57062955b425c1fec90875c16a6954fb83d Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 15:43:50 +0200
Subject: [PATCH 21/34] let run test methods

---
 .../eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index ca91a242a..bf5aba99b 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -41,7 +41,7 @@ public class OrcidNoDoiTest {
 	String orcidIdA = "0000-0003-2760-1191";
 
 	@Test
-	@Ignore
+//	@Ignore
 	public void readPublicationFieldsTest()
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
 		logger.info("running loadPublicationFieldsTest ....");
@@ -95,7 +95,7 @@ public class OrcidNoDoiTest {
 	}
 
 	@Test
-	@Ignore
+//	@Ignore
 	private void authorMatchTest() throws Exception {
 		logger.info("running authorSimpleMatchTest ....");
 		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";

From b0290dbcb7728da0b447d38953702ab681bb1ce0 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 16:20:46 +0200
Subject: [PATCH 22/34] moved all dependencies version to main pom.xml

---
 dhp-workflows/dhp-doiboost/pom.xml | 4 ++--
 pom.xml                            | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml
index 357a57367..b81299cd1 100644
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@@ -14,7 +14,7 @@
             <plugin>
                 <groupId>net.alchim31.maven</groupId>
                 <artifactId>scala-maven-plugin</artifactId>
-                <version>4.0.1</version>
+                <version>${net.alchim31.maven.version}</version>
                 <executions>
                     <execution>
                         <id>scala-compile-first</id>
@@ -51,7 +51,7 @@
         <dependency>
             <groupId>org.apache.httpcomponents</groupId>
             <artifactId>httpclient</artifactId>
-            <version>4.3.4</version>
+            <version>${org.apache.httpcomponents.version}</version>
         </dependency>
         <dependency>
             <groupId>eu.dnetlib.dhp</groupId>
diff --git a/pom.xml b/pom.xml
index 9897c8abe..bae53fcc0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -670,5 +670,7 @@
 		<json4s.version>3.5.3</json4s.version>
 		<jsonschemagenerator.version>4.13.0</jsonschemagenerator.version>
 		<common.text.version>1.8</common.text.version>
+		<org.apache.httpcomponents.version>4.3.4</org.apache.httpcomponents.version>
+		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
 	</properties>
 </project>

From 210a50e4f486c195b627d462e64d8ee10c3dc70e Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 22 Oct 2020 16:24:42 +0200
Subject: [PATCH 23/34] replaced null value

---
 .../doiboost/orcidnodoi/oaf/PublicationToOaf.java      | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index 136356161..ece59c3f1 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -167,10 +167,9 @@ public class PublicationToOaf implements Serializable {
 		final String source = getStringValue(rootElement, "sourceName");
 		if (StringUtils.isNotBlank(source)) {
 			Field<String> sourceField = mapStringField(source, null);
-			if (sourceField==null) {
+			if (sourceField == null) {
 				publication.setSource(null);
-			}
-			else {
+			} else {
 				publication.setSource(Arrays.asList(sourceField));
 			}
 		}
@@ -191,7 +190,7 @@ public class PublicationToOaf implements Serializable {
 					.map(t -> {
 						return mapStructuredProperty(t, q, null);
 					})
-						.filter(s -> s!=null)
+					.filter(s -> s != null)
 					.collect(Collectors.toList()));
 		// Adding identifier
 		final String id = getStringValue(rootElement, "id");
@@ -383,7 +382,8 @@ public class PublicationToOaf implements Serializable {
 						.map(r -> {
 							return mapStructuredProperty(r, q, null);
 						})
-						.filter(s -> s!=null).collect(Collectors.toList()));
+						.filter(s -> s != null)
+						.collect(Collectors.toList()));
 		}
 	}
 

From 6bc7dbeca76e94f6cb00725aa50753d61d122952 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Fri, 6 Nov 2020 13:47:50 +0100
Subject: [PATCH 24/34] first version of dataset successful generated from
 orcid dump 2020

---
 dhp-workflows/dhp-doiboost/pom.xml            |   2 -
 .../doiboost/orcid/OrcidDSManager.java        |   2 +-
 .../orcidnodoi/ActivitiesDumpReader.java      |   4 +-
 .../orcidnodoi/GenOrcidAuthorWork.java        |   2 +-
 .../SparkGenEnrichedOrcidWorks.java           |   8 +-
 .../orcidnodoi/similarity/AuthorMatcher.java  |   4 +-
 .../orcidnodoi/xml/XMLRecordParserNoDoi.java  |  53 +++---
 ... => gen_orcid_authors_from_summaries.json} |   0
 ...en_orcid_works-no-doi_from_activities.json |   7 +
 .../orcid/oozie_app/config-default.xml        |  42 -----
 .../dhp/doiboost/orcid/oozie_app/workflow.xml |  67 --------
 .../oozie_app/config-default.xml              |   2 +-
 .../orcid_activities/oozie_app/workflow.xml   | 156 +++++++++++-------
 .../oozie_app/config-default.xml              |   4 +
 .../orcid_summaries/oozie_app/workflow.xml    |  14 +-
 .../oozie_app/workflow.xml                    |   4 +-
 .../doiboost/orcid/OrcidClientTest.java       |   4 +-
 .../orcidnodoi/xml/OrcidNoDoiTest.java        |  60 ++++++-
 ..._work_0000-0003-2760-1191_contributors.xml | 101 ++++++++++++
 pom.xml                                       |  12 ++
 20 files changed, 320 insertions(+), 228 deletions(-)
 rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{create_orcid_authors_data.json => gen_orcid_authors_from_summaries.json} (100%)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
 rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{gen_enriched_orcid_works => orcidnodoi}/oozie_app/workflow.xml (95%)
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml

diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml
index b81299cd1..624dd7b31 100644
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@@ -51,7 +51,6 @@
         <dependency>
             <groupId>org.apache.httpcomponents</groupId>
             <artifactId>httpclient</artifactId>
-            <version>${org.apache.httpcomponents.version}</version>
         </dependency>
         <dependency>
             <groupId>eu.dnetlib.dhp</groupId>
@@ -87,7 +86,6 @@
         <dependency>
             <groupId>org.apache.commons</groupId>
             <artifactId>commons-text</artifactId>
-            <version>${common.text.version}</version>
         </dependency>
 
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
index b62ad370e..bf13db021 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@@ -62,7 +62,7 @@ public class OrcidDSManager {
 				.toString(
 					OrcidDSManager.class
 						.getResourceAsStream(
-							"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
+							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
 		parser.parseArgument(args);
 
 		hdfsServerUri = parser.get("hdfsServerUri");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index c73e1efd1..c2cfafd87 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -73,7 +73,7 @@ public class ActivitiesDumpReader {
 					SequenceFile.Writer.valueClass(Text.class))) {
 				while ((entry = tais.getNextTarEntry()) != null) {
 					String filename = entry.getName();
-
+					StringBuffer buffer = new StringBuffer();
 					try {
 						if (entry.isDirectory() || !filename.contains("works")) {
 
@@ -83,7 +83,7 @@ public class ActivitiesDumpReader {
 							BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
 																									// tarInput
 							String line;
-							StringBuffer buffer = new StringBuffer();
+							buffer = new StringBuffer();
 							while ((line = br.readLine()) != null) {
 								buffer.append(line);
 							}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
index d32e6d945..d3e9aeaef 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -42,7 +42,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
 				.toString(
 					GenOrcidAuthorWork.class
 						.getResourceAsStream(
-							"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
+							"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
 		parser.parseArgument(args);
 
 		hdfsServerUri = parser.get("hdfsServerUri");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 24f0f7a87..691ca3eee 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -67,7 +67,7 @@ public class SparkGenEnrichedOrcidWorks {
 				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
 				JavaPairRDD<Text, Text> summariesRDD = sc
-					.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
+					.sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
 				Dataset<AuthorData> summariesDataset = spark
 					.createDataset(
 						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
@@ -96,8 +96,8 @@ public class SparkGenEnrichedOrcidWorks {
 						Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
 					.filter(Objects::nonNull)
 					.toJavaRDD();
-				enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
-				logger.info("Works enriched data saved");
+//				enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
+				logger.info("Enriched works RDD ready.");
 
 				final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
 				final LongAccumulator enrichedPublications = spark
@@ -132,7 +132,7 @@ public class SparkGenEnrichedOrcidWorks {
 					.write()
 					.format("parquet")
 					.mode(SaveMode.Overwrite)
-					.save(workingPath + "no_doi_dataset/output");
+					.save(workingPath + outputEnrichedWorksPath);
 
 				logger.info("parsedPublications: " + parsedPublications.value().toString());
 				logger.info("enrichedPublications: " + enrichedPublications.value().toString());
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
index 88c84ee89..6a1468f4c 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -5,6 +5,7 @@ import java.io.IOException;
 import java.text.Normalizer;
 import java.util.*;
 
+import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.text.similarity.JaroWinklerSimilarity;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -40,7 +41,7 @@ public class AuthorMatcher {
 		int matchCounter = 0;
 		List<Integer> matchCounters = Arrays.asList(matchCounter);
 		Contributor contributor = null;
-		contributors.forEach(c -> {
+		contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
 			if (simpleMatch(c.getCreditName(), author.getName()) ||
 				simpleMatch(c.getCreditName(), author.getSurname()) ||
 				simpleMatch(c.getCreditName(), author.getOtherName())) {
@@ -54,6 +55,7 @@ public class AuthorMatcher {
 			Optional<Contributor> optCon = contributors
 				.stream()
 				.filter(c -> c.isSimpleMatch())
+				.filter(c -> !StringUtils.isBlank(c.getCreditName()))
 				.map(c -> {
 					c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
 					return c;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
index c5c115551..f4b093402 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java
@@ -183,39 +183,34 @@ public class XMLRecordParserNoDoi {
 	private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
 		throws XPathParseException, NavException, XPathEvalException {
 		List<Contributor> contributors = new ArrayList<Contributor>();
-		int nameIndex = 0;
-		ap.selectXPath("//work:contributor/work:credit-name");
+		ap.selectXPath("//work:contributors/work:contributor");
 		while (ap.evalXPath() != -1) {
 			Contributor contributor = new Contributor();
-			int t = vn.getText();
-			if (t >= 0) {
-				contributor.setCreditName(vn.toNormalizedString(t));
-				contributors.add(nameIndex, contributor);
-				nameIndex++;
+			if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) {
+				int val = vn.getText();
+				if (val != -1) {
+					contributor.setCreditName(vn.toNormalizedString(val));
+				}
+				vn.toElement(VTDNav.PARENT);
 			}
-		}
-		if (contributors.size() == 0) {
-			return contributors;
-		}
-
-		int sequenceIndex = 0;
-		ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence");
-		while (ap.evalXPath() != -1) {
-			int t = vn.getText();
-			if (t >= 0) {
-				contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t));
-				sequenceIndex++;
-			}
-		}
-
-		int roleIndex = 0;
-		ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
-		while (ap.evalXPath() != -1) {
-			int t = vn.getText();
-			if (t >= 0) {
-				contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
-				roleIndex++;
+			if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
+				if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) {
+					int val = vn.getText();
+					if (val != -1) {
+						contributor.setSequence(vn.toNormalizedString(val));
+					}
+					vn.toElement(VTDNav.PARENT);
+				}
+				if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) {
+					int val = vn.getText();
+					if (val != -1) {
+						contributor.setRole(vn.toNormalizedString(val));
+					}
+					vn.toElement(VTDNav.PARENT);
+				}
+				vn.toElement(VTDNav.PARENT);
 			}
+			contributors.add(contributor);
 		}
 		return contributors;
 	}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json
similarity index 100%
rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json
rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json
new file mode 100644
index 000000000..c3a8f92ec
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json
@@ -0,0 +1,7 @@
+[
+ {"paramName":"n",   "paramLongName":"hdfsServerUri",	"paramDescription": "the server uri",   "paramRequired": true},
+ {"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the default work path",	"paramRequired": true},
+ {"paramName":"f",   "paramLongName":"activitiesFileNameTarGz",	"paramDescription": "the name of the activities orcid file",	"paramRequired": true},
+ {"paramName":"ow",   "paramLongName":"outputWorksPath",	"paramDescription": "the relative folder of the sequencial file to write",	"paramRequired": true},
+ {"paramName":"oew",   "paramLongName":"outputEnrichedWorksPath",	"paramDescription": "the relative folder of the sequencial file to write the data",	"paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
deleted file mode 100644
index fe14bb8cb..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<configuration>
-    <property>
-        <name>jobTracker</name>
-        <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>hive_metastore_uris</name>
-        <value>thrift://hadoop-edge2.garr-pa1.d4science.org:9083</value>
-    </property>
-    <property>
-        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://hadoop-edge1.garr-pa1.d4science.org:18089/</value>
-    </property>
-    <property>
-        <name>spark2EventLogDir</name>
-        <value>/user/spark/spark2ApplicationHistory</value>
-    </property>
-    <property>
-        <name>spark2ExtraListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
-    </property>
-    <property>
-        <name>spark2SqlQueryExecutionListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
-    </property>
-</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
deleted file mode 100644
index 51e00dc0f..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>workingPath</name>
-            <description>the working dir base path</description>
-        </property>
-        <property>
-            <name>shell_cmd_0</name>
-            <value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
-            </value>
-            <description>the shell command that downloads and puts to hdfs orcid summaries</description>
-        </property>
-    </parameters>
-    
-    <start to="ResetWorkingPath"/>
-    
-    
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-    
-    <action name="ResetWorkingPath">
-        <fs>
-            <delete path='${workingPath}/summaries/output'/>
-            <mkdir path='${workingPath}/summaries/output'/>
-        </fs>
-        <ok to="check_exist_on_hdfs_summaries"/>
-        <error to="Kill"/>
-    </action>
-
-    <decision name="check_exist_on_hdfs_summaries">
-        <switch>
-            <case to="ImportOrcidSummaries">
-                ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
-            </case>
-            <default to="DownloadSummaries" />
-        </switch>
-    </decision>
-
-    <action name="DownloadSummaries">
-        <shell xmlns="uri:oozie:shell-action:0.1">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <exec>bash</exec>
-            <argument>-c</argument>
-            <argument>${shell_cmd_0}</argument>
-            <capture-output/>
-        </shell>
-        <ok to="ImportOrcidSummaries"/>
-        <error to="Kill"/>
-    </action>
-    
-    <action name="ImportOrcidSummaries">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
-            <arg>-w</arg><arg>${workingPath}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
-            <arg>-o</arg><arg>summaries/output/</arg>
-        </java>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
index 3068562d0..05fe6d014 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
@@ -9,7 +9,7 @@
     </property>
     <property>
         <name>oozie.launcher.mapreduce.map.java.opts</name>
-        <value>-Xmx4g</value>
+        <value>-Xmx2g</value>
     </property>
     <property>
         <name>jobTracker</name>
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
index 8f9a5123e..ea4d33296 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Gen Orcid Works-no-doi From Activities" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>workingPath</name>
@@ -6,70 +6,70 @@
         </property>
         <property>
             <name>shell_cmd_0</name>
-            <value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
         </property>
         <property>
             <name>shell_cmd_1</name>
-            <value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
         </property>
         <property>
             <name>shell_cmd_2</name>
-            <value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
         </property>
         <property>
             <name>shell_cmd_3</name>
-            <value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
-        </property> 
+        </property>
         <property>
             <name>shell_cmd_4</name>
-            <value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
-        </property> 
+        </property>
         <property>
             <name>shell_cmd_5</name>
-            <value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
-        </property>  
+        </property>
         <property>
             <name>shell_cmd_6</name>
-            <value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
         </property>
         <property>
             <name>shell_cmd_7</name>
-            <value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
         </property>
         <property>
             <name>shell_cmd_8</name>
-            <value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
         </property>
         <property>
             <name>shell_cmd_9</name>
-            <value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
-        </property> 
+        </property>
         <property>
             <name>shell_cmd_X</name>
-            <value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
-        </property>  
+        </property>
     </parameters>
 
     <start to="ResetWorkingPath"/>
@@ -82,11 +82,11 @@
         <fs>
             <delete path='${workingPath}/no_doi_works/*'/>
         </fs>
-        <ok to="fork_gen_orcid_author_work"/>
+        <ok to="fork_check_download_files"/>
         <error to="Kill"/>
     </action>
     
-    <fork name = "fork_gen_orcid_author_work">
+    <fork name = "fork_check_download_files">
       <path start = "check_exist_on_hdfs_activities_0"/>
       <path start = "check_exist_on_hdfs_activities_1"/>
       <path start = "check_exist_on_hdfs_activities_2"/>
@@ -102,8 +102,8 @@
    	
     <decision name="check_exist_on_hdfs_activities_0">
          <switch>
-            <case to="GenOrcidAuthorWork_0">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))}
             </case>
             <default to="Download_0" />
          </switch>
@@ -118,7 +118,7 @@
 	    <argument>${shell_cmd_0}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_0"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -129,7 +129,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -139,8 +139,8 @@
     
     <decision name="check_exist_on_hdfs_activities_1">
          <switch>
-            <case to="GenOrcidAuthorWork_1">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))}
             </case>
             <default to="Download_1" />
          </switch>
@@ -155,7 +155,7 @@
 	    <argument>${shell_cmd_1}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_1"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -166,7 +166,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -176,8 +176,8 @@
     
     <decision name="check_exist_on_hdfs_activities_2">
          <switch>
-            <case to="GenOrcidAuthorWork_2">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))}
             </case>
             <default to="Download_2" />
          </switch>
@@ -192,7 +192,7 @@
 	    <argument>${shell_cmd_2}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_2"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -203,7 +203,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -213,8 +213,8 @@
     
     <decision name="check_exist_on_hdfs_activities_3">
          <switch>
-            <case to="GenOrcidAuthorWork_3">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))}
             </case>
             <default to="Download_3" />
          </switch>
@@ -229,7 +229,7 @@
 	    <argument>${shell_cmd_3}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_3"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -240,7 +240,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -250,8 +250,8 @@
     
     <decision name="check_exist_on_hdfs_activities_4">
          <switch>
-            <case to="GenOrcidAuthorWork_4">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))}
             </case>
             <default to="Download_4" />
          </switch>
@@ -266,7 +266,7 @@
 	    <argument>${shell_cmd_4}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_4"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -277,7 +277,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -287,8 +287,8 @@
     
     <decision name="check_exist_on_hdfs_activities_5">
          <switch>
-            <case to="GenOrcidAuthorWork_5">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))}
             </case>
             <default to="Download_5" />
          </switch>
@@ -303,7 +303,7 @@
 	    <argument>${shell_cmd_5}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_5"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -314,7 +314,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -324,8 +324,8 @@
     
     <decision name="check_exist_on_hdfs_activities_6">
          <switch>
-            <case to="GenOrcidAuthorWork_6">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))}
             </case>
             <default to="Download_6" />
          </switch>
@@ -340,7 +340,7 @@
 	    <argument>${shell_cmd_6}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_6"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -351,7 +351,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -362,8 +362,8 @@
     
     <decision name="check_exist_on_hdfs_activities_7">
          <switch>
-            <case to="GenOrcidAuthorWork_7">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))}
             </case>
             <default to="Download_7" />
          </switch>
@@ -378,7 +378,7 @@
 	    <argument>${shell_cmd_7}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_7"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -389,7 +389,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -399,8 +399,8 @@
     
     <decision name="check_exist_on_hdfs_activities_8">
          <switch>
-            <case to="GenOrcidAuthorWork_8">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))}
             </case>
             <default to="Download_8" />
          </switch>
@@ -415,7 +415,7 @@
 	    <argument>${shell_cmd_8}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_8"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -426,7 +426,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -436,8 +436,8 @@
     
     <decision name="check_exist_on_hdfs_activities_9">
          <switch>
-            <case to="GenOrcidAuthorWork_9">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))}
             </case>
             <default to="Download_9" />
          </switch>
@@ -452,7 +452,7 @@
 	    <argument>${shell_cmd_9}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_9"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -463,7 +463,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -473,8 +473,8 @@
     
     <decision name="check_exist_on_hdfs_activities_X">
          <switch>
-            <case to="GenOrcidAuthorWork_X">
-              ${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
+            <case to="wait_download_phase_node">
+              ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))}
             </case>
             <default to="Download_X" />
          </switch>
@@ -489,7 +489,7 @@
 	    <argument>${shell_cmd_X}</argument>
 		<capture-output/>
 		</shell>
-	<ok to="GenOrcidAuthorWork_X"/>
+	<ok to="wait_download_phase_node"/>
 	<error to="Kill"/>
 	</action>
 	
@@ -500,7 +500,7 @@
             <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
             <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
             <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
         </java>
@@ -508,7 +508,35 @@
         <error to="Kill"/>
     </action>
 
+    <join name = "wait_download_phase_node" to = "fork_gen_orcid_author_work"/>
+
+    <fork name = "fork_gen_orcid_author_work">
+        <path start = "GenOrcidAuthorWork_0"/>
+        <path start = "GenOrcidAuthorWork_1"/>
+        <path start = "GenOrcidAuthorWork_2"/>
+        <path start = "GenOrcidAuthorWork_3"/>
+        <path start = "GenOrcidAuthorWork_4"/>
+        <path start = "GenOrcidAuthorWork_5"/>
+        <path start = "GenOrcidAuthorWork_6"/>
+        <path start = "GenOrcidAuthorWork_7"/>
+        <path start = "GenOrcidAuthorWork_8"/>
+        <path start = "GenOrcidAuthorWork_9"/>
+        <path start = "GenOrcidAuthorWork_X"/>
+    </fork>
+
     <join name = "join_node" to = "End"/>
-    
+
+<!--    <join name = "join_node" to = "fork_gen_orcid_author_work_2"/>-->
+
+<!--    <fork name = "fork_gen_orcid_author_work_2">-->
+<!--        <path start = "GenOrcidAuthorWork_6"/>-->
+<!--        <path start = "GenOrcidAuthorWork_7"/>-->
+<!--        <path start = "GenOrcidAuthorWork_8"/>-->
+<!--        <path start = "GenOrcidAuthorWork_9"/>-->
+<!--        <path start = "GenOrcidAuthorWork_X"/>-->
+<!--    </fork>-->
+
+<!--    <join name = "join_node_2" to = "End"/>-->
+
    <end name="End"/>
 </workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
index e77dd09c9..e1829e847 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
@@ -19,4 +19,8 @@
         <name>oozie.launcher.mapreduce.user.classpath.first</name>
         <value>true</value>
     </property>
+    <property>
+        <name>oozie.launcher.mapreduce.map.java.opts</name>
+        <value>-Xmx16g</value>
+    </property>
 </configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
index 3362cc67b..8517f35ee 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Gen Orcid Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>workingPath</name>
@@ -6,7 +6,7 @@
         </property>
         <property>
             <name>shell_cmd_0</name>
-            <value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
+            <value>wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz
             </value>
             <description>the shell command that downloads and puts to hdfs orcid summaries</description>
         </property>
@@ -21,8 +21,8 @@
     
     <action name="ResetWorkingPath">
         <fs>
-            <delete path='${workingPath}/summaries/output'/>
-            <mkdir path='${workingPath}/summaries/output'/>
+            <delete path='${workingPath}/authors'/>
+            <mkdir path='${workingPath}/authors'/>
         </fs>
         <ok to="check_exist_on_hdfs_summaries"/>
         <error to="Kill"/>
@@ -31,7 +31,7 @@
     <decision name="check_exist_on_hdfs_summaries">
         <switch>
             <case to="ImportOrcidSummaries">
-                ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
+                ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))}
             </case>
             <default to="DownloadSummaries" />
         </switch>
@@ -57,8 +57,8 @@
             <main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
             <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
-            <arg>-o</arg><arg>summaries/output/</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
+            <arg>-o</arg><arg>authors/</arg>
         </java>
         <ok to="End"/>
         <error to="Kill"/>
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml
similarity index 95%
rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml
index faed3104a..6cec48a6d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml
@@ -59,7 +59,7 @@
     
     <action name="ResetWorkingPath">
         <fs>
-            <delete path='${workingPath}/no_doi_enriched_works/output'/>
+            <delete path='${workingPath}/no_doi_dataset'/>
         </fs>
         <ok to="GenOrcidNoDoiDataset"/>
         <error to="Kill"/>
@@ -85,7 +85,7 @@
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-f</arg><arg>-</arg>
             <arg>-ow</arg><arg>no_doi_works/</arg>
-            <arg>-oew</arg><arg>no_doi_enriched_works/output</arg>
+            <arg>-oew</arg><arg>no_doi_dataset</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
index 5e0f91ecd..774475626 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
@@ -38,8 +38,8 @@ public class OrcidClientTest {
 
 	@Test
 	public void downloadTest() throws Exception {
-		String record = testDownloadRecord("0000-0002-2536-4498");
-		File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
+		String record = testDownloadRecord("0000-0001-6163-2042");
+		File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml");
 		OutputStream outStream = new FileOutputStream(f);
 		IOUtils.write(record.getBytes(), outStream);
 		System.out.println("saved to tmp");
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index bf5aba99b..fa2980ac4 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -2,15 +2,20 @@
 package eu.dnetlib.doiboost.orcidnodoi.xml;
 
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
 import java.text.Normalizer;
 import java.util.*;
 
+import javax.validation.constraints.AssertTrue;
+
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.text.similarity.JaccardSimilarity;
 import org.apache.commons.text.similarity.JaroWinklerSimilarity;
 import org.junit.jupiter.api.Test;
+import org.mortbay.log.Log;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -41,7 +46,6 @@ public class OrcidNoDoiTest {
 	String orcidIdA = "0000-0003-2760-1191";
 
 	@Test
-//	@Ignore
 	public void readPublicationFieldsTest()
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
 		logger.info("running loadPublicationFieldsTest ....");
@@ -95,8 +99,7 @@ public class OrcidNoDoiTest {
 	}
 
 	@Test
-//	@Ignore
-	private void authorMatchTest() throws Exception {
+	public void authorMatchTest() throws Exception {
 		logger.info("running authorSimpleMatchTest ....");
 		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
 		AuthorData author = new AuthorData();
@@ -121,9 +124,60 @@ public class OrcidNoDoiTest {
 			logger.error("parsing xml", e);
 		}
 		assertNotNull(workData);
+
+		Contributor a = workData.getContributors().get(0);
+		assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
+
 		AuthorMatcher.match(author, workData.getContributors());
 		GsonBuilder builder = new GsonBuilder();
 		Gson gson = builder.create();
 		logger.info(gson.toJson(workData));
+
+		assertTrue(workData.getContributors().size() == 6);
+		Contributor c = workData.getContributors().get(0);
+		assertTrue(c.getOid().equals("0000-0003-2760-1191"));
+		assertTrue(c.getName().equals("Khairy"));
+		assertTrue(c.getSurname().equals("Abdel Dayem"));
+		assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
+	}
+
+	@Test
+	public void readContributorsTest()
+		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+		logger.info("running loadPublicationFieldsTest ....");
+		String xml = IOUtils
+			.toString(
+				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml"));
+
+		if (xml == null) {
+			logger.info("Resource not found");
+		}
+		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
+		if (p == null) {
+			logger.info("XMLRecordParserNoDoi null");
+		}
+		WorkDataNoDoi workData = null;
+		try {
+			workData = p.VTDParseWorkData(xml.getBytes());
+		} catch (Exception e) {
+			logger.error("parsing xml", e);
+		}
+		assertNotNull(workData.getContributors());
+		assertTrue(workData.getContributors().size() == 5);
+		assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName()));
+		assertTrue(workData.getContributors().get(0).getSequence().equals("seq0"));
+		assertTrue(workData.getContributors().get(0).getRole().equals("role0"));
+		assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1"));
+		assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence()));
+		assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole()));
+		assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2"));
+		assertTrue(workData.getContributors().get(2).getSequence().equals("seq2"));
+		assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole()));
+		assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3"));
+		assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence()));
+		assertTrue(workData.getContributors().get(3).getRole().equals("role3"));
+		assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName()));
+		assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
+		assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
 	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml
new file mode 100644
index 000000000..26e64aeda
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191_contributors.xml
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<work:work xmlns:address="http://www.orcid.org/ns/address"
+           xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
+           xmlns:employment="http://www.orcid.org/ns/employment"
+           xmlns:education="http://www.orcid.org/ns/education"
+           xmlns:other-name="http://www.orcid.org/ns/other-name"
+           xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+           xmlns:funding="http://www.orcid.org/ns/funding"
+           xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+           xmlns:service="http://www.orcid.org/ns/service"
+           xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+           xmlns:distinction="http://www.orcid.org/ns/distinction"
+           xmlns:internal="http://www.orcid.org/ns/internal"
+           xmlns:membership="http://www.orcid.org/ns/membership"
+           xmlns:person="http://www.orcid.org/ns/person"
+           xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+           xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+           xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+           xmlns:activities="http://www.orcid.org/ns/activities"
+           xmlns:qualification="http://www.orcid.org/ns/qualification"
+           xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+           xmlns:error="http://www.orcid.org/ns/error"
+           xmlns:preferences="http://www.orcid.org/ns/preferences"
+           xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+           xmlns:work="http://www.orcid.org/ns/work"
+           xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
+           path="/0000-0003-2760-1191/work/28776099" visibility="public">
+    <common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
+    <common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
+    <common:source>
+        <common:source-orcid>
+            <common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
+            <common:path>0000-0002-9157-3431</common:path>
+            <common:host>orcid.org</common:host>
+        </common:source-orcid>
+        <common:source-name>Europe PubMed Central</common:source-name>
+    </common:source>
+    <work:title>
+        <common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
+            Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
+            ST-Segment-Elevation Myocardial Infarction.</common:title>
+    </work:title>
+    <work:citation>
+        <work:citation-type>formatted-unspecified</work:citation-type>
+        <work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
+            Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
+    </work:citation>
+    <work:type>journal-article</work:type>
+    <common:publication-date>
+        <common:year>2016</common:year>
+        <common:month>11</common:month>
+    </common:publication-date>
+    <common:external-ids>
+        <common:external-id>
+            <common:external-id-type>pmid</common:external-id-type>
+            <common:external-id-value>27899851</common:external-id-value>
+            <common:external-id-normalized transient="true">27899851</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+        <common:external-id>
+            <common:external-id-type>pmc</common:external-id-type>
+            <common:external-id-value>PMC5126442</common:external-id-value>
+            <common:external-id-normalized transient="true"
+            >PMC5126442</common:external-id-normalized>
+            <common:external-id-relationship>self</common:external-id-relationship>
+        </common:external-id>
+    </common:external-ids>
+    <common:url>http://europepmc.org/abstract/med/27899851</common:url>
+    <work:contributors>
+        <work:contributor>
+            <work:contributor-attributes>
+                <work:contributor-sequence>seq0</work:contributor-sequence>
+                <work:contributor-role>role0</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>creditname1</work:credit-name>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>creditname2</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>seq2</work:contributor-sequence>
+                <work:contributor-role></work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name>creditname3</work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence></work:contributor-sequence>
+                <work:contributor-role>role3</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+        <work:contributor>
+            <work:credit-name></work:credit-name>
+            <work:contributor-attributes>
+                <work:contributor-sequence>seq4</work:contributor-sequence>
+                <work:contributor-role>role4</work:contributor-role>
+            </work:contributor-attributes>
+        </work:contributor>
+    </work:contributors>
+</work:work>
diff --git a/pom.xml b/pom.xml
index d64de01ac..3629e2f1b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -458,6 +458,18 @@
 				<version>${jsonschemagenerator.version}</version>
 			</dependency>
 
+			<dependency>
+				<groupId>org.apache.commons</groupId>
+				<artifactId>commons-text</artifactId>
+				<version>${common.text.version}</version>
+			</dependency>
+
+			<dependency>
+				<groupId>org.apache.httpcomponents</groupId>
+				<artifactId>httpclient</artifactId>
+				<version>${org.apache.httpcomponents.version}</version>
+			</dependency>
+
 		</dependencies>
 	</dependencyManagement>
 

From 1513174d7ec367222c063ba47095ee7ca4897e99 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Tue, 10 Nov 2020 11:44:55 +0100
Subject: [PATCH 25/34] added further test case

---
 .../SparkGenEnrichedOrcidWorks.java           |   2 +-
 .../orcidnodoi/similarity/AuthorMatcher.java  |  50 +++--
 .../orcidnodoi/xml/OrcidNoDoiTest.java        | 181 ++++++++++++++++--
 .../xml/activity_work_0000-0003-2760-1191.xml |   2 +-
 4 files changed, 202 insertions(+), 33 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 691ca3eee..40cd212da 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -96,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks {
 						Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
 					.filter(Objects::nonNull)
 					.toJavaRDD();
-//				enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
+				enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/");
 				logger.info("Enriched works RDD ready.");
 
 				final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
index 6a1468f4c..2f86820fb 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -33,7 +33,7 @@ import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 public class AuthorMatcher {
 
 	private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
-	private static final Double threshold = 0.8;
+	public static final Double threshold = 0.8;
 
 	public static void match(AuthorData author, List<Contributor> contributors)
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
@@ -41,16 +41,35 @@ public class AuthorMatcher {
 		int matchCounter = 0;
 		List<Integer> matchCounters = Arrays.asList(matchCounter);
 		Contributor contributor = null;
-		contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
-			if (simpleMatch(c.getCreditName(), author.getName()) ||
-				simpleMatch(c.getCreditName(), author.getSurname()) ||
-				simpleMatch(c.getCreditName(), author.getOtherName())) {
-				matchCounters.set(0, matchCounters.get(0) + 1);
-				c.setSimpleMatch(true);
-			}
-		});
+		contributors
+			.stream()
+			.filter(c -> !StringUtils.isBlank(c.getCreditName()))
+			.forEach(c -> {
+				if (simpleMatch(c.getCreditName(), author.getName()) ||
+					simpleMatch(c.getCreditName(), author.getSurname()) ||
+					simpleMatch(c.getCreditName(), author.getOtherName())) {
+					matchCounters.set(0, matchCounters.get(0) + 1);
+					c.setSimpleMatch(true);
+				}
+			});
 		if (matchCounters.get(0) == 1) {
 			updateAuthorsSimpleMatch(contributors, author);
+		} else if (matchCounters.get(0) == 0) {
+			Optional<Contributor> optCon = contributors
+				.stream()
+				.filter(c -> !StringUtils.isBlank(c.getCreditName()))
+				.map(c -> {
+					c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
+					return c;
+				})
+				.filter(c -> c.getScore() >= threshold)
+				.max(Comparator.comparing(c -> c.getScore()));
+			Contributor bestMatchContributor = null;
+			if (optCon.isPresent()) {
+				bestMatchContributor = optCon.get();
+				bestMatchContributor.setBestMatch(true);
+				updateAuthorsSimilarityMatch(contributors, author);
+			}
 		} else if (matchCounters.get(0) > 1) {
 			Optional<Contributor> optCon = contributors
 				.stream()
@@ -68,19 +87,18 @@ public class AuthorMatcher {
 				bestMatchContributor.setBestMatch(true);
 				updateAuthorsSimilarityMatch(contributors, author);
 			}
-
 		}
 
 	}
 
-	private static boolean simpleMatch(String name, String searchValue) {
+	public static boolean simpleMatch(String name, String searchValue) {
 		if (searchValue == null) {
 			return false;
 		}
 		return normalize(name).contains(normalize(searchValue));
 	}
 
-	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
+	public static Double bestMatch(String authorSurname, String authorName, String contributor) {
 		String[] contributorSplitted = contributor.split(" ");
 		if (contributorSplitted.length == 0) {
 			return 0.0;
@@ -106,7 +124,7 @@ public class AuthorMatcher {
 		return sm2;
 	}
 
-	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
+	public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
 		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
 		return score;
 	}
@@ -115,7 +133,7 @@ public class AuthorMatcher {
 		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
 	}
 
-	private static String normalize(final String s) {
+	public static String normalize(final String s) {
 		if (s == null) {
 			return new String("");
 		}
@@ -140,7 +158,7 @@ public class AuthorMatcher {
 		return surname + " " + name;
 	}
 
-	private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
+	public static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
 		contributors.forEach(c -> {
 			if (c.isSimpleMatch()) {
 				c.setName(author.getName());
@@ -151,7 +169,7 @@ public class AuthorMatcher {
 		updateRanks(contributors);
 	}
 
-	private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
+	public static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
 		contributors
 			.stream()
 			.filter(c -> c.isBestMatch())
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index fa2980ac4..c2c4ed5e1 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -38,12 +38,9 @@ public class OrcidNoDoiTest {
 
 	private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
 
-	String nameA = "Khairy";
-	String surnameA = "Abdel Dayem";
-	String otherNameA = "Dayem MKA";
-	String nameB = "K";
-	String surnameB = "Abdel-Dayem";
-	String orcidIdA = "0000-0003-2760-1191";
+	static String nameA = "Khairy";
+	static String surnameA = "Abdel Dayem";
+	static String orcidIdA = "0000-0003-2760-1191";
 
 	@Test
 	public void readPublicationFieldsTest()
@@ -99,7 +96,7 @@ public class OrcidNoDoiTest {
 	}
 
 	@Test
-	public void authorMatchTest() throws Exception {
+	public void authorDoubleMatchTest() throws Exception {
 		logger.info("running authorSimpleMatchTest ....");
 		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
 		AuthorData author = new AuthorData();
@@ -129,16 +126,8 @@ public class OrcidNoDoiTest {
 		assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
 
 		AuthorMatcher.match(author, workData.getContributors());
-		GsonBuilder builder = new GsonBuilder();
-		Gson gson = builder.create();
-		logger.info(gson.toJson(workData));
 
 		assertTrue(workData.getContributors().size() == 6);
-		Contributor c = workData.getContributors().get(0);
-		assertTrue(c.getOid().equals("0000-0003-2760-1191"));
-		assertTrue(c.getName().equals("Khairy"));
-		assertTrue(c.getSurname().equals("Abdel Dayem"));
-		assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
 	}
 
 	@Test
@@ -180,4 +169,166 @@ public class OrcidNoDoiTest {
 		assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
 		assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
 	}
+
+	@Test
+	public void authorSimpleMatchTest() throws Exception {
+		String orcidWork = "activity_work_0000-0002-5982-8983.xml";
+		AuthorData author = new AuthorData();
+		author.setName("Parkhouse");
+		author.setSurname("H.");
+		author.setOid("0000-0002-5982-8983");
+		String xml = IOUtils
+			.toString(
+				OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
+
+		if (xml == null) {
+			logger.info("Resource not found");
+		}
+		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
+		if (p == null) {
+			logger.info("XMLRecordParserNoDoi null");
+		}
+		WorkDataNoDoi workData = null;
+		try {
+			workData = p.VTDParseWorkData(xml.getBytes());
+		} catch (Exception e) {
+			logger.error("parsing xml", e);
+		}
+		assertNotNull(workData);
+
+		Contributor a = workData.getContributors().get(0);
+		assertTrue(a.getCreditName().equals("Parkhouse, H."));
+
+		AuthorMatcher.match(author, workData.getContributors());
+
+		assertTrue(workData.getContributors().size() == 2);
+		Contributor c = workData.getContributors().get(0);
+		assertTrue(c.getOid().equals("0000-0002-5982-8983"));
+		assertTrue(c.getName().equals("Parkhouse"));
+		assertTrue(c.getSurname().equals("H."));
+		assertTrue(c.getCreditName().equals("Parkhouse, H."));
+	}
+
+	@Test
+	public void match() {
+
+		AuthorData author = new AuthorData();
+		author.setName("Joe");
+		author.setSurname("Dodge");
+		author.setOid("0000-1111-2222-3333");
+		Contributor contributor = new Contributor();
+		contributor.setCreditName("Joe Dodge");
+		List<Contributor> contributors = Arrays.asList(contributor);
+		AuthorMatcher am = new AuthorMatcher();
+		int matchCounter = 0;
+		List<Integer> matchCounters = Arrays.asList(matchCounter);
+		contributors
+			.stream()
+			.filter(c -> !StringUtils.isBlank(c.getCreditName()))
+			.forEach(c -> {
+				if (am.simpleMatch(c.getCreditName(), author.getName()) ||
+					am.simpleMatch(c.getCreditName(), author.getSurname()) ||
+					am.simpleMatch(c.getCreditName(), author.getOtherName())) {
+					matchCounters.set(0, matchCounters.get(0) + 1);
+					c.setSimpleMatch(true);
+				}
+			});
+
+		assertTrue(matchCounters.get(0) == 1);
+		am.updateAuthorsSimpleMatch(contributors, author);
+		assertTrue(contributors.get(0).getName().equals("Joe"));
+		assertTrue(contributors.get(0).getSurname().equals("Dodge"));
+		assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge"));
+		assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));
+
+		AuthorData authorX = new AuthorData();
+		authorX.setName(nameA);
+		authorX.setSurname(surnameA);
+		authorX.setOid(orcidIdA);
+		Contributor contributorA = new Contributor();
+		contributorA.setCreditName("Abdel-Dayem Khai");
+		Contributor contributorB = new Contributor();
+		contributorB.setCreditName("Abdel-Dayem Fake");
+		List<Contributor> contributorList = new ArrayList<>();
+		contributorList.add(contributorA);
+		contributorList.add(contributorB);
+		int matchCounter2 = 0;
+		List<Integer> matchCounters2 = Arrays.asList(matchCounter2);
+		contributorList
+			.stream()
+			.filter(c -> !StringUtils.isBlank(c.getCreditName()))
+			.forEach(c -> {
+				if (am.simpleMatch(c.getCreditName(), authorX.getName()) ||
+					am.simpleMatch(c.getCreditName(), authorX.getSurname()) ||
+					am.simpleMatch(c.getCreditName(), authorX.getOtherName())) {
+					int currentCounter = matchCounters2.get(0);
+					currentCounter += 1;
+					matchCounters2.set(0, currentCounter);
+					c.setSimpleMatch(true);
+				}
+			});
+
+		assertTrue(matchCounters2.get(0) == 2);
+		assertTrue(contributorList.get(0).isSimpleMatch());
+		assertTrue(contributorList.get(1).isSimpleMatch());
+
+		Optional<Contributor> optCon = contributorList
+			.stream()
+			.filter(c -> c.isSimpleMatch())
+			.filter(c -> !StringUtils.isBlank(c.getCreditName()))
+			.map(c -> {
+				c.setScore(am.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName()));
+				return c;
+			})
+			.filter(c -> c.getScore() >= AuthorMatcher.threshold)
+			.max(Comparator.comparing(c -> c.getScore()));
+		assertTrue(optCon.isPresent());
+
+		final Contributor bestMatchContributor = optCon.get();
+		bestMatchContributor.setBestMatch(true);
+		assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai"));
+		assertTrue(contributorList.get(0).isBestMatch());
+		assertTrue(!contributorList.get(1).isBestMatch());
+		am.updateAuthorsSimilarityMatch(contributorList, authorX);
+		assertTrue(contributorList.get(0).getName().equals(nameA));
+		assertTrue(contributorList.get(0).getSurname().equals(surnameA));
+		assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai"));
+		assertTrue(contributorList.get(0).getOid().equals(orcidIdA));
+		assertTrue(StringUtils.isBlank(contributorList.get(1).getOid()));
+	}
+
+	@Test
+	public void authorBestMatchTest() throws Exception {
+		String name = "Khairy";
+		String surname = "Abdel Dayem";
+		String orcidWork = "activity_work_0000-0003-2760-1191.xml";
+		AuthorData author = new AuthorData();
+		author.setName(name);
+		author.setSurname(surname);
+		author.setOid(orcidIdA);
+		String xml = IOUtils
+			.toString(
+				OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
+
+		if (xml == null) {
+			logger.info("Resource not found");
+		}
+		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
+		if (p == null) {
+			logger.info("XMLRecordParserNoDoi null");
+		}
+		WorkDataNoDoi workData = null;
+		try {
+			workData = p.VTDParseWorkData(xml.getBytes());
+		} catch (Exception e) {
+			logger.error("parsing xml", e);
+		}
+		AuthorMatcher.match(author, workData.getContributors());
+		assertTrue(workData.getContributors().size() == 5);
+		List<Contributor> c = workData.getContributors();
+		assertTrue(c.get(0).getName().equals(name));
+		assertTrue(c.get(0).getSurname().equals(surname));
+		assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye"));
+		assertTrue(c.get(0).getOid().equals(orcidIdA));
+	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
index 485f4f8e8..83752b145 100644
--- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml
@@ -68,7 +68,7 @@
     <common:url>http://europepmc.org/abstract/med/27899851</common:url>
     <work:contributors>
         <work:contributor>
-            <work:credit-name>Abdel-Dayem K</work:credit-name>
+            <work:credit-name>Khair Abde Daye</work:credit-name>
             <work:contributor-attributes>
                 <work:contributor-sequence>first</work:contributor-sequence>
                 <work:contributor-role>author</work:contributor-role>

From 1f861f2b0de77d6a693c5d5144696409c32592a7 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Wed, 11 Nov 2020 17:38:50 +0100
Subject: [PATCH 26/34] now wf output is a sequence file with the format
 seq("eu.dnetlib.dhp.schema.oaf.Publication",eu.dnetlib.dhp.schema.action.AtomicActions)

---
 .../SparkGenEnrichedOrcidWorks.java           | 62 ++++++++++---------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 40cd212da..7f715fa7d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -1,14 +1,21 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.io.IOException;
-import java.util.Objects;
-import java.util.Optional;
-
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
@@ -16,24 +23,17 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
-import com.google.gson.Gson;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonParser;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Publication;
-import eu.dnetlib.doiboost.orcid.json.JsonHelper;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
-import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
 import scala.Tuple2;
 
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Optional;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
 /**
  * This spark job generates one parquet file, containing orcid publications dataset
  */
@@ -42,6 +42,8 @@ public class SparkGenEnrichedOrcidWorks {
 
 	static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
 
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
 	public static void main(String[] args) throws IOException, Exception {
 
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -58,6 +60,7 @@ public class SparkGenEnrichedOrcidWorks {
 		final String workingPath = parser.get("workingPath");
 		final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
 		final String outputWorksPath = parser.get("outputWorksPath");
+		final String hdfsServerUri = parser.get("hdfsServerUri");
 
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
@@ -96,7 +99,6 @@ public class SparkGenEnrichedOrcidWorks {
 						Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
 					.filter(Objects::nonNull)
 					.toJavaRDD();
-				enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/");
 				logger.info("Enriched works RDD ready.");
 
 				final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
@@ -124,15 +126,17 @@ public class SparkGenEnrichedOrcidWorks {
 						})
 					.filter(p -> p != null);
 
-				Dataset<Publication> publicationDataset = spark
-					.createDataset(
-						oafPublicationRDD.repartition(1).rdd(),
-						Encoders.bean(Publication.class));
-				publicationDataset
-					.write()
-					.format("parquet")
-					.mode(SaveMode.Overwrite)
-					.save(workingPath + outputEnrichedWorksPath);
+				oafPublicationRDD
+					.mapToPair(
+						p -> new Tuple2<>(p.getClass().toString(),
+							OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, (Publication) p))))
+					.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
+					.saveAsNewAPIHadoopFile(
+						workingPath.concat(outputEnrichedWorksPath),
+						Text.class,
+						Text.class,
+						SequenceFileOutputFormat.class,
+						sc.hadoopConfiguration());
 
 				logger.info("parsedPublications: " + parsedPublications.value().toString());
 				logger.info("enrichedPublications: " + enrichedPublications.value().toString());

From 13f28fa225d248e080c13d99694c3069826b3184 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Thu, 12 Nov 2020 17:43:32 +0100
Subject: [PATCH 27/34] moved AuthorData to dhp-schemas; added other names to
 author data

---
 .../dnetlib/dhp/schema/orcid}/AuthorData.java |  18 +-
 .../orcid/SparkGenerateDoiAuthorList.java     |   5 +-
 .../doiboost/orcid/SummariesDecompressor.java |   7 +-
 .../doiboost/orcid/xml/XMLRecordParser.java   |   8 +-
 .../SparkGenEnrichedOrcidWorks.java           |  38 +-
 .../doiboost/orcidnodoi/json/JsonWriter.java  |  19 +-
 .../orcidnodoi/model/Contributor.java         |   2 +-
 .../orcidnodoi/similarity/AuthorMatcher.java  |  11 +-
 .../oozie_app/config-default.xml              |   2 +-
 .../orcid/xml/XMLRecordParserTest.java        |  21 +-
 .../orcidnodoi/xml/OrcidNoDoiTest.java        |  12 +-
 .../orcid/xml/record_8888-8888-8888-8880.xml  | 770 ++++++++++++++++++
 .../summary_0000-0001-5109-1000_othername.xml | 196 +++++
 13 files changed, 1053 insertions(+), 56 deletions(-)
 rename {dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid}/AuthorData.java (71%)
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorData.java
similarity index 71%
rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorData.java
index e0624509b..6c94cdb13 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorData.java
@@ -1,7 +1,10 @@
 
-package eu.dnetlib.doiboost.orcid.model;
+package eu.dnetlib.dhp.schema.orcid;
 
 import java.io.Serializable;
+import java.util.List;
+
+import com.google.common.collect.Lists;
 
 /**
  * This class models the data that are retrieved from orcid publication
@@ -13,8 +16,8 @@ public class AuthorData implements Serializable {
 	private String name;
 	private String surname;
 	private String creditName;
-	private String otherName;
 	private String errorCode;
+	private List<String> otherNames;
 
 	public String getErrorCode() {
 		return errorCode;
@@ -56,11 +59,14 @@ public class AuthorData implements Serializable {
 		this.oid = oid;
 	}
 
-	public String getOtherName() {
-		return otherName;
+	public List<String> getOtherNames() {
+		return otherNames;
 	}
 
-	public void setOtherName(String otherName) {
-		this.otherName = otherName;
+	public void setOtherNames(List<String> otherNames) {
+		if (this.otherNames == null) {
+			this.otherNames = Lists.newArrayList();
+		}
+		this.otherNames = otherNames;
 	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
index b4239bba2..011c153ec 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
@@ -13,9 +13,6 @@ import java.util.stream.Stream;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
@@ -33,7 +30,7 @@ import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import scala.Tuple2;
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index 29d72ed0b..d1b2a1d73 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
 import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 
@@ -56,6 +56,7 @@ public class SummariesDecompressor {
 		int nameFound = 0;
 		int surnameFound = 0;
 		int creditNameFound = 0;
+		int otherNamesFound = 0;
 		int errorFromOrcidFound = 0;
 		int xmlParserErrorFound = 0;
 		try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
@@ -117,6 +118,9 @@ public class SummariesDecompressor {
 								if (authorData.getCreditName() != null) {
 									creditNameFound += 1;
 								}
+								if (authorData.getOtherNames() != null && authorData.getOtherNames().size() > 1) {
+									otherNamesFound += authorData.getOtherNames().size();
+								}
 
 							} else {
 								Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
@@ -152,6 +156,7 @@ public class SummariesDecompressor {
 		Log.info("Name found: " + nameFound);
 		Log.info("Surname found: " + surnameFound);
 		Log.info("Credit name found: " + creditNameFound);
+		Log.info("Other names found: " + otherNamesFound);
 		Log.info("Error from Orcid found: " + errorFromOrcidFound);
 		Log.info("Error parsing xml record found: " + xmlParserErrorFound);
 	}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
index 2e43f4d3e..a807cf132 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
@@ -14,7 +14,7 @@ import com.ximpleware.VTDNav;
 
 import eu.dnetlib.dhp.parser.utility.VtdException;
 import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 
 public class XMLRecordParser {
@@ -81,6 +81,12 @@ public class XMLRecordParser {
 		if (!creditNames.isEmpty()) {
 			authorData.setCreditName(creditNames.get(0));
 		}
+
+		final List<String> otherNames = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content");
+		if (!otherNames.isEmpty()) {
+			authorData.setOtherNames(otherNames);
+		}
+
 		return authorData;
 	}
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 7f715fa7d..cc65b0b4f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -1,18 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.gson.Gson;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonParser;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.action.AtomicAction;
-import eu.dnetlib.dhp.schema.oaf.Publication;
-import eu.dnetlib.doiboost.orcid.json.JsonHelper;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
-import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
@@ -26,14 +20,22 @@ import org.apache.spark.sql.Encoders;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
 import scala.Tuple2;
 
-import java.io.IOException;
-import java.util.Objects;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
 /**
  * This spark job generates one parquet file, containing orcid publications dataset
  */
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
index 363cb13e6..982fb6316 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
@@ -1,9 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi.json;
 
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.gson.JsonObject;
 
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 
 /**
@@ -12,15 +15,11 @@ import eu.dnetlib.doiboost.orcid.model.WorkData;
 
 public class JsonWriter {
 
-	public static String create(AuthorData authorData) {
-		JsonObject author = new JsonObject();
-		author.addProperty("oid", authorData.getOid());
-		author.addProperty("name", authorData.getName());
-		author.addProperty("surname", authorData.getSurname());
-		if (authorData.getCreditName() != null) {
-			author.addProperty("creditname", authorData.getCreditName());
-		}
-		return author.toString();
+	public static final com.fasterxml.jackson.databind.ObjectMapper OBJECT_MAPPER = new ObjectMapper()
+		.setSerializationInclusion(JsonInclude.Include.NON_NULL);;
+
+	public static String create(AuthorData authorData) throws JsonProcessingException {
+		return OBJECT_MAPPER.writeValueAsString(authorData);
 	}
 
 	public static String create(WorkData workData) {
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
index 9a8651c85..9222c1cc4 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
@@ -3,7 +3,7 @@ package eu.dnetlib.doiboost.orcidnodoi.model;
 
 import java.io.Serializable;
 
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 
 /**
  * This class models the data related to a contributor, that are retrieved from an orcid publication
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
index 2f86820fb..c0f617868 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -18,7 +18,7 @@ import com.ximpleware.XPathEvalException;
 import com.ximpleware.XPathParseException;
 
 import eu.dnetlib.dhp.parser.utility.VtdException;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 
@@ -47,7 +47,7 @@ public class AuthorMatcher {
 			.forEach(c -> {
 				if (simpleMatch(c.getCreditName(), author.getName()) ||
 					simpleMatch(c.getCreditName(), author.getSurname()) ||
-					simpleMatch(c.getCreditName(), author.getOtherName())) {
+					simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
 					matchCounters.set(0, matchCounters.get(0) + 1);
 					c.setSimpleMatch(true);
 				}
@@ -91,6 +91,13 @@ public class AuthorMatcher {
 
 	}
 
+	public static boolean simpleMatchOnOtherNames(String name, List<String> otherNames) {
+		if (otherNames == null || (otherNames != null && otherNames.isEmpty())) {
+			return false;
+		}
+		return otherNames.stream().filter(o -> simpleMatch(name, o)).count() > 0;
+	}
+
 	public static boolean simpleMatch(String name, String searchValue) {
 		if (searchValue == null) {
 			return false;
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
index e1829e847..191654378 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
@@ -21,6 +21,6 @@
     </property>
     <property>
         <name>oozie.launcher.mapreduce.map.java.opts</name>
-        <value>-Xmx16g</value>
+        <value>-Xmx8g</value>
     </property>
 </configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
index 5bf6f27b9..b7be5e5cd 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
@@ -2,12 +2,14 @@
 package eu.dnetlib.doiboost.orcid.xml;
 
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.Test;
 
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 
 public class XMLRecordParserTest {
 
@@ -55,4 +57,21 @@ public class XMLRecordParserTest {
 		assertNotNull(workData.getDoi());
 		System.out.println("doi: " + workData.getDoi());
 	}
+
+	@Test
+	public void testOrcidOtherNamesXMLParser() throws Exception {
+
+		String xml = IOUtils
+			.toString(
+				this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml"));
+
+		XMLRecordParser p = new XMLRecordParser();
+
+		AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes());
+		assertNotNull(authorData);
+		assertNotNull(authorData.getOtherNames());
+		assertTrue(authorData.getOtherNames().get(0).equals("Andrew C. Porteus"));
+		String jsonData = JsonWriter.create(authorData);
+		assertNotNull(jsonData);
+	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index c2c4ed5e1..948e5b094 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -5,34 +5,24 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
-import java.text.Normalizer;
 import java.util.*;
 
-import javax.validation.constraints.AssertTrue;
-
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.text.similarity.JaccardSimilarity;
-import org.apache.commons.text.similarity.JaroWinklerSimilarity;
 import org.junit.jupiter.api.Test;
-import org.mortbay.log.Log;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
 import com.ximpleware.NavException;
 import com.ximpleware.ParseException;
 import com.ximpleware.XPathEvalException;
 import com.ximpleware.XPathParseException;
 
 import eu.dnetlib.dhp.parser.utility.VtdException;
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
-import jdk.nashorn.internal.ir.annotations.Ignore;
 
 public class OrcidNoDoiTest {
 
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml
new file mode 100644
index 000000000..7abc2f35a
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/record_8888-8888-8888-8880.xml
@@ -0,0 +1,770 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<record:record path="/8888-8888-8888-8880" xmlns:activities="http://www.orcid.org/ns/activities"
+               xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+               xmlns:other-name="http://www.orcid.org/ns/other-name"
+               xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+               xmlns:email="http://www.orcid.org/ns/email" xmlns:address="http://www.orcid.org/ns/address"
+               xmlns:keyword="http://www.orcid.org/ns/keyword"
+               xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+               xmlns:employment="http://www.orcid.org/ns/employment" xmlns:common="http://www.orcid.org/ns/common"
+               xmlns:education="http://www.orcid.org/ns/education" xmlns:funding="http://www.orcid.org/ns/funding"
+               xmlns:history="http://www.orcid.org/ns/history" xmlns:person="http://www.orcid.org/ns/person"
+               xmlns:preferences="http://www.orcid.org/ns/preferences" xmlns:record="http://www.orcid.org/ns/record"
+               xmlns:peer-review="http://www.orcid.org/ns/peer-review" xmlns:work="http://www.orcid.org/ns/work"
+               xmlns:distinction="http://www.orcid.org/ns/distinction" xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+               xmlns:membership="http://www.orcid.org/ns/membership" xmlns:qualification="http://www.orcid.org/ns/qualification"
+               xmlns:service="http://www.orcid.org/ns/service"
+               xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+               xsi:schemaLocation="http://www.orcid.org/ns/record ../record-3.0.xsd ">
+    <common:orcid-identifier>
+        <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+        <common:path>8888-8888-8888-8880</common:path>
+        <common:host>orcid.org</common:host>
+    </common:orcid-identifier>
+    <preferences:preferences>
+        <preferences:locale>zh_CN</preferences:locale>
+    </preferences:preferences>
+    <history:history visibility="private">
+        <history:creation-method>API</history:creation-method>
+        <history:completion-date>2001-12-31T12:00:00</history:completion-date>
+        <history:submission-date>2001-12-31T12:00:00</history:submission-date>
+        <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+        <history:claimed>true</history:claimed>
+        <common:source>
+            <common:source-orcid>
+                <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                <common:path>8888-8888-8888-8880</common:path>
+                <common:host>orcid.org</common:host>
+            </common:source-orcid>
+            <common:source-name />
+        </common:source>
+        <history:deactivation-date>2001-12-31T12:00:00</history:deactivation-date>
+        <history:verified-email>true</history:verified-email>
+        <history:verified-primary-email>true</history:verified-primary-email>
+    </history:history>
+    <person:person path="/8888-8888-8888-8880">
+        <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+        <person:name visibility="public"
+                     path="/8888-8888-8888-8880/personal-details">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <personal-details:given-names>give-names</personal-details:given-names>
+            <personal-details:family-name>family-name</personal-details:family-name>
+            <personal-details:credit-name>credit-name</personal-details:credit-name>
+        </person:name>
+        <other-name:other-names
+                path="/8888-8888-8888-8880/other-names">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <other-name:other-name visibility="public"
+                                   put-code="1" display-index="0">
+                <common:created-date>2001-12-31T12:00:00</common:created-date>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                        <common:path>8888-8888-8888-8880</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name />
+                </common:source>
+                <other-name:content>other-name-1</other-name:content>
+            </other-name:other-name>
+        </other-name:other-names>
+        <person:biography visibility="public"
+                          path="/8888-8888-8888-8880/biography">
+            <common:created-date>2001-12-31T12:00:00</common:created-date>
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <personal-details:content>biography</personal-details:content>
+        </person:biography>
+        <researcher-url:researcher-urls
+                path="/8888-8888-8888-8880/researcher-urls">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <researcher-url:researcher-url
+                    put-code="1248" visibility="public" display-index="0">
+                <common:created-date>2001-12-31T12:00:00</common:created-date>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                        <common:path>8888-8888-8888-8880</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name />
+                </common:source>
+                <researcher-url:url-name>url-name-1</researcher-url:url-name>
+                <researcher-url:url>http://url.com/</researcher-url:url>
+            </researcher-url:researcher-url>
+        </researcher-url:researcher-urls>
+        <email:emails path="/8888-8888-8888-8880/email">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <email:email visibility="public" put-code="0">
+                <common:created-date>2001-12-31T12:00:00</common:created-date>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                        <common:path>8888-8888-8888-8880</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name />
+                </common:source>
+                <email:email>user1@email.com</email:email>
+            </email:email>
+        </email:emails>
+        <address:addresses path="/8888-8888-8888-8880/address">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <address:address visibility="public" put-code="1"
+                             display-index="0">
+                <common:created-date>2001-12-31T12:00:00</common:created-date>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                        <common:path>8888-8888-8888-8880</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name />
+                </common:source>
+                <address:country>US</address:country>
+            </address:address>
+        </address:addresses>
+        <keyword:keywords path="/8888-8888-8888-8880/keywords">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <keyword:keyword visibility="public" put-code="1"
+                             display-index="0">
+                <common:created-date>2001-12-31T12:00:00</common:created-date>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                        <common:path>8888-8888-8888-8880</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name />
+                </common:source>
+                <keyword:content>keyword1</keyword:content>
+            </keyword:keyword>
+        </keyword:keywords>
+        <external-identifier:external-identifiers
+                path="/8888-8888-8888-8880/external-identifiers">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <external-identifier:external-identifier
+                    visibility="public" put-code="1" display-index="0">
+                <common:created-date>2001-12-31T12:00:00</common:created-date>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                        <common:path>8888-8888-8888-8880</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name />
+                </common:source>
+                <common:external-id-type>type-1</common:external-id-type>
+                <common:external-id-value>value-1</common:external-id-value>
+                <common:external-id-url>http://url.com/1</common:external-id-url>
+                <common:external-id-relationship>self</common:external-id-relationship>
+            </external-identifier:external-identifier>
+        </external-identifier:external-identifiers>
+    </person:person>
+    <activities:activities-summary>
+        <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+        <activities:distinctions path="/8888-8888-8888-8880/distinctions">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2001-12-31T12:00:00
+                </common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value
+                        </common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <distinction:distinction-summary put-code="0"
+                                                 visibility="private" display-index="0">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <common:department-name>distinction:department-name</common:department-name>
+                    <common:role-title>distinction:role-title</common:role-title>
+                    <common:start-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:start-date>
+                    <common:end-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:end-date>
+                    <common:organization>
+                        <common:name>distinction-org</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-distinction</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>GRID</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </distinction:distinction-summary>
+            </activities:affiliation-group>
+        </activities:distinctions>
+        <activities:educations>
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2001-12-31T12:00:00
+                </common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value
+                        </common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <education:education-summary put-code="0"
+                                             visibility="private">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <common:department-name>education:department-name</common:department-name>
+                    <common:role-title>education:role-title</common:role-title>
+                    <common:start-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:start-date>
+                    <common:end-date>
+                        <common:year>2019</common:year>
+                        <common:month>01</common:month>
+                        <common:day>01</common:day>
+                    </common:end-date>
+                    <common:organization>
+                        <common:name>education-org</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-education</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>GRID</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </education:education-summary>
+            </activities:affiliation-group>
+        </activities:educations>
+        <activities:employments>
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2001-12-31T12:00:00
+                </common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value
+                        </common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <employment:employment-summary
+                        put-code="0" visibility="private">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <common:department-name>employment:department-name</common:department-name>
+                    <common:role-title>employment:role-title</common:role-title>
+                    <common:start-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:start-date>
+                    <common:end-date>
+                        <common:year>2025</common:year>
+                    </common:end-date>
+                    <common:organization>
+                        <common:name>employment-org</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-employment</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>GRID</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </employment:employment-summary>
+            </activities:affiliation-group>
+        </activities:employments>
+        <activities:fundings>
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:group>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>grant_number</common:external-id-type>
+                        <common:external-id-value>external-id-value-1</common:external-id-value>
+                    </common:external-id>
+                </common:external-ids>
+                <funding:funding-summary put-code="0" visibility="private">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <funding:title>
+                        <common:title>common:title</common:title>
+                        <common:translated-title language-code="en">common:translated-title</common:translated-title>
+                    </funding:title>
+                    <common:external-ids>
+                        <common:external-id>
+                            <common:external-id-type>grant_number</common:external-id-type>
+                            <common:external-id-value>external-id-value-1</common:external-id-value>
+                            <common:external-id-url>http://tempuri.org</common:external-id-url>
+                            <common:external-id-relationship>self</common:external-id-relationship>
+                        </common:external-id>
+                    </common:external-ids>
+                    <funding:type>grant</funding:type>
+                    <common:start-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:start-date>
+                    <common:end-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:end-date>
+                    <common:organization>
+                        <common:name>common:name</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-funding</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>FUNDREF</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </funding:funding-summary>
+            </activities:group>
+        </activities:fundings>
+        <activities:invited-positions path="/8888-8888-8888-8880/invited-positions">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2001-12-31T12:00:00
+                </common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value
+                        </common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <invited-position:invited-position-summary put-code="0"
+                                                           visibility="private" display-index="0">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <common:department-name>invited-position:department-name</common:department-name>
+                    <common:role-title>invited-position:role-title</common:role-title>
+                    <common:start-date>
+                        <common:year>2019</common:year>
+                        <common:month>01</common:month>
+                        <common:day>01</common:day>
+                    </common:start-date>
+                    <common:end-date>
+                        <common:year>2025</common:year>
+                        <common:month>01</common:month>
+                        <common:day>01</common:day>
+                    </common:end-date>
+                    <common:organization>
+                        <common:name>invited-position-org</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-invited-position</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>GRID</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </invited-position:invited-position-summary>
+            </activities:affiliation-group>
+        </activities:invited-positions>
+        <activities:memberships path="/8888-8888-8888-8880/memberships">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2001-12-31T12:00:00
+                </common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value
+                        </common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <membership:membership-summary put-code="0"
+                                               visibility="private" display-index="0">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <common:department-name>membership:department-name</common:department-name>
+                    <common:role-title>membership:role-title</common:role-title>
+                    <common:start-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:start-date>
+                    <common:organization>
+                        <common:name>membership-org</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-membership</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>RINGGOLD</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </membership:membership-summary>
+            </activities:affiliation-group>
+        </activities:memberships>
+        <activities:peer-reviews>
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:group>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>something</common:external-id-type>
+                        <common:external-id-value>external-id-value</common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>self</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <activities:peer-review-group>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:external-ids>
+                        <common:external-id>
+                            <common:external-id-type>something</common:external-id-type>
+                            <common:external-id-value>external-id-value
+                            </common:external-id-value>
+                            <common:external-id-url>http://orcid.org</common:external-id-url>
+                            <common:external-id-relationship>self</common:external-id-relationship>
+                        </common:external-id>
+                    </common:external-ids>
+                    <peer-review:peer-review-summary put-code="12345"
+                                                     visibility="private" display-index="0">
+                        <common:created-date>2001-12-31T12:00:00</common:created-date>
+                        <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                        <common:source>
+                            <common:source-client-id>
+                                <common:uri>https://orcid.org/client/APP-9999999999999901</common:uri>
+                                <common:path>APP-9999999999999901</common:path>
+                                <common:host>orcid.org</common:host>
+                            </common:source-client-id>
+                            <common:source-name />
+                        </common:source>
+                        <peer-review:reviewer-role>reviewer</peer-review:reviewer-role>
+                        <common:external-ids>
+                            <common:external-id>
+                                <common:external-id-type>something</common:external-id-type>
+                                <common:external-id-value>external-id-value</common:external-id-value>
+                                <common:external-id-url>http://orcid.org</common:external-id-url>
+                                <common:external-id-relationship>self</common:external-id-relationship>
+                            </common:external-id>
+                        </common:external-ids>
+                        <peer-review:review-url>http://orcid.org</peer-review:review-url>
+                        <peer-review:review-type>review</peer-review:review-type>
+                        <peer-review:completion-date>
+                            <common:year>1948</common:year>
+                            <common:month>02</common:month>
+                            <common:day>02</common:day>
+                        </peer-review:completion-date>
+                        <peer-review:review-group-id>orcid-generated:12345</peer-review:review-group-id>
+                        <peer-review:convening-organization>
+                            <common:name>common:name</common:name>
+                            <common:address>
+                                <common:city>common:city</common:city>
+                                <common:region>common:region</common:region>
+                                <common:country>AF</common:country>
+                            </common:address>
+                            <common:disambiguated-organization>
+                                <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-peer-review</common:disambiguated-organization-identifier>
+                                <common:disambiguation-source>RINGGOLD</common:disambiguation-source>
+                            </common:disambiguated-organization>
+                        </peer-review:convening-organization>
+                    </peer-review:peer-review-summary>
+                </activities:peer-review-group>
+            </activities:group>
+        </activities:peer-reviews>
+        <activities:qualifications path="/8888-8888-8888-8880/qualifications">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2001-12-31T12:00:00
+                </common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value
+                        </common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <qualification:qualification-summary put-code="0"
+                                                     visibility="private" display-index="0">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <common:department-name>qualification:department-name</common:department-name>
+                    <common:role-title>qualification:role-title</common:role-title>
+                    <common:start-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:start-date>
+                    <common:end-date>
+                        <common:year>2025</common:year>
+                        <common:month>12</common:month>
+                    </common:end-date>
+                    <common:organization>
+                        <common:name>qualification-org</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-qualification</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>RINGGOLD</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </qualification:qualification-summary>
+            </activities:affiliation-group>
+        </activities:qualifications>
+        <activities:research-resources path="/8888-8888-8888-8880/research-resources">
+            <common:last-modified-date>2017-01-18T15:06:05.147-06:00</common:last-modified-date>
+            <activities:group>
+                <common:last-modified-date>2017-01-18T15:03:56.856-06:00</common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>proposal_id</common:external-id-type>
+                        <common:external-id-value>123456</common:external-id-value>
+                        <common:external-id-relationship>self</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <research-resource:research-resource-summary
+                        put-code="1234" path="/0000-0003-0902-4386/research-resource/1234" visibility="public">
+                    <!-- common metadata -->
+                    <common:created-date>2015-06-25T16:01:12.718Z</common:created-date>
+                    <common:last-modified-date>2017-09-08T13:31:19.987Z
+                    </common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/0000-0000-0000-0000</common:uri>
+                            <common:path>0000-0000-0000-0000</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name>XSEDE ORCID integration</common:source-name>
+                    </common:source>
+                    <!-- proposal title and host(s) -->
+                    <research-resource:proposal> <!-- proposal/award/credential section -->
+                        <research-resource:title>
+                            <common:title>Giant Laser Award</common:title>
+                        </research-resource:title>
+                        <research-resource:hosts>
+                            <common:organization>
+                                <common:name>XSEDE</common:name>
+                                <common:address>
+                                    <common:city>city</common:city>
+                                    <common:region>region</common:region>
+                                    <common:country>US</common:country>
+                                </common:address>
+                                <common:disambiguated-organization>
+                                    <common:disambiguated-organization-identifier>XX</common:disambiguated-organization-identifier>
+                                    <common:disambiguation-source>grid</common:disambiguation-source>
+                                </common:disambiguated-organization>
+                            </common:organization>
+                        </research-resource:hosts>
+                        <common:external-ids>
+                            <common:external-id>
+                                <common:external-id-type>proposal_id</common:external-id-type>
+                                <common:external-id-value>123456</common:external-id-value>
+                                <common:external-id-relationship>self</common:external-id-relationship>
+                            </common:external-id>
+                        </common:external-ids>
+                        <common:start-date>
+                            <common:year>1999</common:year>
+                            <common:month>02</common:month>
+                            <common:day>02</common:day>
+                        </common:start-date>
+                        <common:end-date>
+                            <common:year>2012</common:year>
+                            <common:month>02</common:month>
+                            <common:day>02</common:day>
+                        </common:end-date>
+                        <common:url>http://xsede.org/GiantLaserAward</common:url>
+                    </research-resource:proposal>
+                </research-resource:research-resource-summary>
+            </activities:group>
+        </activities:research-resources>
+        <activities:services path="/8888-8888-8888-8880/services">
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2001-12-31T12:00:00
+                </common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value
+                        </common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <service:service-summary put-code="0"
+                                         visibility="private" display-index="0">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name />
+                    </common:source>
+                    <common:department-name>service:department-name</common:department-name>
+                    <common:role-title>service:role-title</common:role-title>
+                    <common:start-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:start-date>
+                    <common:organization>
+                        <common:name>service-org</common:name>
+                        <common:address>
+                            <common:city>common:city</common:city>
+                            <common:region>common:region</common:region>
+                            <common:country>AF</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-service</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>RINGGOLD</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </service:service-summary>
+            </activities:affiliation-group>
+        </activities:services>
+        <activities:works>
+            <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+            <activities:group>
+                <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                <common:external-ids>
+                    <common:external-id>
+                        <common:external-id-type>agr</common:external-id-type>
+                        <common:external-id-value>external-id-value</common:external-id-value>
+                        <common:external-id-url>http://orcid.org</common:external-id-url>
+                        <common:external-id-relationship>part-of</common:external-id-relationship>
+                    </common:external-id>
+                </common:external-ids>
+                <work:work-summary put-code="0" visibility="private">
+                    <common:created-date>2001-12-31T12:00:00</common:created-date>
+                    <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
+                    <common:source>
+                        <common:source-client-id>
+                            <common:uri>https://orcid.org/client/8888-8888-8888-8880</common:uri>
+                            <common:path>8888-8888-8888-8880</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-client-id>
+                        <common:source-name />
+                    </common:source>
+                    <work:title>
+                        <common:title>common:title</common:title>
+                        <common:subtitle />
+                        <common:translated-title language-code="en">common:translated-title</common:translated-title>
+                    </work:title>
+                    <common:external-ids>
+                        <common:external-id>
+                            <common:external-id-type>agr</common:external-id-type>
+                            <common:external-id-value>external-id-value</common:external-id-value>
+                            <common:external-id-url>http://tempuri.org</common:external-id-url>
+                            <common:external-id-relationship>self</common:external-id-relationship>
+                        </common:external-id>
+                    </common:external-ids>
+                    <work:type>artistic-performance</work:type>
+                    <common:publication-date>
+                        <common:year>1948</common:year>
+                        <common:month>02</common:month>
+                        <common:day>02</common:day>
+                    </common:publication-date>
+                    <work:journal-title>Procedia Computer Science</work:journal-title>
+                </work:work-summary>
+            </activities:group>
+        </activities:works>
+    </activities:activities-summary>
+</record:record>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml
new file mode 100644
index 000000000..43bc96b8c
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-5109-1000_othername.xml
@@ -0,0 +1,196 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<record:record xmlns:address="http://www.orcid.org/ns/address"
+               xmlns:email="http://www.orcid.org/ns/email
+    " xmlns:history="http://www.orcid.org/ns/history"
+               xmlns:employment="http://www.orcid.org/ns/employment"
+               xmlns:education="http://www.orcid.org/ns/education"
+               xmlns:other-name="http://www.orcid.org/ns/other-name"
+               xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+               xmlns:funding="http://www.orcid.org/ns/funding"
+               xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+               xmlns:service="http://www.orcid.org/ns/service"
+               xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+               xmlns:distinction="http://www.orcid.org/ns/distinction"
+               xmlns:internal="http://www.orcid.org/ns/internal"
+               xmlns:membership="http://www.orcid.org/ns/membership"
+               xmlns:person="http://www.orcid.org/ns/person"
+               xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+               xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+               xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+               xmlns:activities="http://www.orcid.org/ns/activities"
+               xmlns:qualification="http://www.orcid.org/ns/qualification"
+               xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+               xmlns:error="http://www.orcid.org/ns/error"
+               xmlns:preferences="http://www.orcid.org/ns/preferences"
+               xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+               xmlns:work="http://www.orcid.org/ns/work"
+               xmlns:peer-review="http://www.orcid.org/ns/peer-review" path="/0000-0001-5109-1000">
+    <common:orcid-identifier>
+        <common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
+        <common:path>0000-0001-5109-1000</common:path>
+        <common:host>orcid.org</common:host>
+    </common:orcid-identifier>
+    <preferences:preferences>
+        <preferences:locale>en</preferences:locale>
+    </preferences:preferences>
+    <history:history>
+        <history:creation-method>Member-referred</history:creation-method>
+        <history:submission-date>2019-05-01T13:04:57.507Z</history:submission-date>
+        <common:last-modified-date>2019-05-01T13:59:54.268Z</common:last-modified-date>
+        <history:claimed>true</history:claimed>
+        <history:verified-email>true</history:verified-email>
+        <history:verified-primary-email>true</history:verified-primary-email>
+    </history:history>
+    <person:person path="/0000-0001-5109-1000/person">
+        <common:last-modified-date>2019-05-01T13:45:47.727Z</common:last-modified-date>
+        <person:name visibility="public" path="0000-0001-5109-1000">
+            <common:created-date>2019-05-01T13:04:57.507Z</common:created-date>
+            <common:last-modified-date>2019-05-01T13:04:57.740Z</common:last-modified-date>
+            <personal-details:given-names>Andrew</personal-details:given-names>
+            <personal-details:family-name>Porteus</personal-details:family-name>
+        </person:name>
+        <other-name:other-names path="/0000-0001-5109-1000/other-names">
+            <common:last-modified-date>2019-05-01T13:44:57.072Z</common:last-modified-date>
+            <other-name:other-name put-code="1238811" visibility="public" path="/0000-0001-5109-1000/other-names/1238811" display-index="1">
+                <common:created-date>2019-05-01T13:44:57.072Z</common:created-date>
+                <common:last-modified-date>2019-05-01T13:44:57.072Z</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
+                        <common:path>0000-0001-5109-1000</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name>Andrew Porteus</common:source-name>
+                </common:source>
+                <other-name:content>Andrew C. Porteus</other-name:content>
+            </other-name:other-name>
+        </other-name:other-names>
+        <person:biography visibility="public" path="/0000-0001-5109-1000/biography">
+            <common:created-date>2019-05-01T13:59:54.263Z</common:created-date>
+            <common:last-modified-date>2019-05-01T13:59:54.263Z</common:last-modified-date>
+            <personal-details:content>Retired Librarian</personal-details:content>
+        </person:biography>
+        <researcher-url:researcher-urls path="/0000-0001-5109-1000/researcher-urls">
+            <common:last-modified-date>2019-05-01T13:45:47.727Z</common:last-modified-date>
+            <researcher-url:researcher-url put-code="1722812" visibility="public" path="/0000-0001-5109-1000/researcher-urls/1722812" display-index="1">
+                <common:created-date>2019-05-01T13:45:47.727Z</common:created-date>
+                <common:last-modified-date>2019-05-01T13:45:47.727Z</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
+                        <common:path>0000-0001-5109-1000</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name>Andrew Porteus</common:source-name>
+                </common:source>
+                <researcher-url:url-name>Niagara Falls Poetry Project</researcher-url:url-name>
+                <researcher-url:url>http://niagarapoetry.ca</researcher-url:url>
+            </researcher-url:researcher-url>
+        </researcher-url:researcher-urls>
+        <email:emails path="/0000-0001-5109-1000/email"/>
+        <address:addresses path="/0000-0001-5109-1000/address">
+            <common:last-modified-date>2019-05-01T13:45:09.764Z</common:last-modified-date>
+            <address:address put-code="1247706" visibility="public" path="/0000-0001-5109-1000/address/1247706" display-index="1">
+                <common:created-date>2019-05-01T13:45:09.764Z</common:created-date>
+                <common:last-modified-date>2019-05-01T13:45:09.764Z</common:last-modified-date>
+                <common:source>
+                    <common:source-orcid>
+                        <common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
+                        <common:path>0000-0001-5109-1000</common:path>
+                        <common:host>orcid.org</common:host>
+                    </common:source-orcid>
+                    <common:source-name>Andrew Porteus</common:source-name>
+                </common:source>
+                <address:country>CA</address:country>
+            </address:address>
+        </address:addresses>
+        <keyword:keywords path="/0000-0001-5109-1000/keywords"/>
+        <external-identifier:external-identifiers path="/0000-0001-5109-1000/external-identifiers"/>
+    </person:person>
+    <activities:activities-summary path="/0000-0001-5109-1000/activities">
+        <common:last-modified-date>2019-05-01T13:57:45.787Z</common:last-modified-date>
+        <activities:distinctions path="/0000-0001-5109-1000/distinctions"/>
+        <activities:educations path="/0000-0001-5109-1000/educations">
+            <common:last-modified-date>2019-05-01T13:15:26.102Z</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2019-05-01T13:15:26.102Z</common:last-modified-date>
+                <common:external-ids/>
+                <education:education-summary put-code="7801952" display-index="1" path="/0000-0001-5109-1000/education/7801952" visibility="public">
+                    <common:created-date>2019-05-01T13:15:26.102Z</common:created-date>
+                    <common:last-modified-date>2019-05-01T13:15:26.102Z</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
+                            <common:path>0000-0001-5109-1000</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name>Andrew Porteus</common:source-name>
+                    </common:source>
+                    <common:role-title>Library Technician Diploma</common:role-title>
+                    <common:start-date>
+                        <common:year>1976</common:year>
+                        <common:month>09</common:month>
+                    </common:start-date>
+                    <common:end-date>
+                        <common:year>1978</common:year>
+                        <common:month>05</common:month>
+                    </common:end-date>
+                    <common:organization>
+                        <common:name>Niagara College</common:name>
+                        <common:address>
+                            <common:city>Welland</common:city>
+                            <common:region>ON</common:region>
+                            <common:country>CA</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>125147</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>RINGGOLD</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </education:education-summary>
+            </activities:affiliation-group>
+        </activities:educations>
+        <activities:employments path="/0000-0001-5109-1000/employments"/>
+        <activities:fundings path="/0000-0001-5109-1000/fundings"/>
+        <activities:invited-positions path="/0000-0001-5109-1000/invited-positions"/>
+        <activities:memberships path="/0000-0001-5109-1000/memberships"/>
+        <activities:peer-reviews path="/0000-0001-5109-1000/peer-reviews"/>
+        <activities:qualifications path="/0000-0001-5109-1000/qualifications">
+            <common:last-modified-date>2019-05-01T13:19:49.021Z</common:last-modified-date>
+            <activities:affiliation-group>
+                <common:last-modified-date>2019-05-01T13:19:49.021Z</common:last-modified-date>
+                <common:external-ids/>
+                <qualification:qualification-summary put-code="7801973" display-index="1" path="/0000-0001-5109-1000/qualification/7801973" visibility="public">
+                    <common:created-date>2019-05-01T13:19:49.021Z</common:created-date>
+                    <common:last-modified-date>2019-05-01T13:19:49.021Z</common:last-modified-date>
+                    <common:source>
+                        <common:source-orcid>
+                            <common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
+                            <common:path>0000-0001-5109-1000</common:path>
+                            <common:host>orcid.org</common:host>
+                        </common:source-orcid>
+                        <common:source-name>Andrew Porteus</common:source-name>
+                    </common:source>
+                    <common:department-name>Communication, Film &amp; Popular Culture</common:department-name>
+                    <common:role-title>Master's Candidate</common:role-title>
+                    <common:start-date>
+                        <common:year>2018</common:year>
+                        <common:month>09</common:month>
+                    </common:start-date>
+                    <common:organization>
+                        <common:name>Brock University</common:name>
+                        <common:address>
+                            <common:city>Saint Catharines</common:city>
+                            <common:region>ON</common:region>
+                            <common:country>CA</common:country>
+                        </common:address>
+                        <common:disambiguated-organization>
+                            <common:disambiguated-organization-identifier>7497</common:disambiguated-organization-identifier>
+                            <common:disambiguation-source>RINGGOLD</common:disambiguation-source>
+                        </common:disambiguated-organization>
+                    </common:organization>
+                </qualification:qualification-summary>
+            </activities:affiliation-group>
+        </activities:qualifications>
+    </activities:activities-summary>
+</record:record>
\ No newline at end of file

From 9a2fa9dc2f45f030ca358621ddbbd51a4be3bf2c Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Fri, 13 Nov 2020 10:25:34 +0100
Subject: [PATCH 28/34] added test for other names parsing from summaries dump

---
 .../orcidnodoi/xml/OrcidNoDoiTest.java        | 28 +++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index 948e5b094..1f77197ab 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -13,6 +13,7 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
 import com.ximpleware.NavException;
 import com.ximpleware.ParseException;
 import com.ximpleware.XPathEvalException;
@@ -218,7 +219,7 @@ public class OrcidNoDoiTest {
 			.forEach(c -> {
 				if (am.simpleMatch(c.getCreditName(), author.getName()) ||
 					am.simpleMatch(c.getCreditName(), author.getSurname()) ||
-					am.simpleMatch(c.getCreditName(), author.getOtherName())) {
+					am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
 					matchCounters.set(0, matchCounters.get(0) + 1);
 					c.setSimpleMatch(true);
 				}
@@ -250,7 +251,7 @@ public class OrcidNoDoiTest {
 			.forEach(c -> {
 				if (am.simpleMatch(c.getCreditName(), authorX.getName()) ||
 					am.simpleMatch(c.getCreditName(), authorX.getSurname()) ||
-					am.simpleMatch(c.getCreditName(), authorX.getOtherName())) {
+					am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
 					int currentCounter = matchCounters2.get(0);
 					currentCounter += 1;
 					matchCounters2.set(0, currentCounter);
@@ -321,4 +322,27 @@ public class OrcidNoDoiTest {
 		assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye"));
 		assertTrue(c.get(0).getOid().equals(orcidIdA));
 	}
+
+	@Test
+	public void otherNamesMatchTest()
+		throws VtdException, ParseException, IOException, XPathEvalException, NavException, XPathParseException {
+
+		AuthorData author = new AuthorData();
+		author.setName("Joe");
+		author.setSurname("Dodge");
+		author.setOid("0000-1111-2222-3333");
+		String otherName1 = new String("Joe Dr. Dodge");
+		String otherName2 = new String("XY");
+		List<String> others = Lists.newArrayList();
+		others.add(otherName1);
+		others.add(otherName2);
+		author.setOtherNames(others);
+		Contributor contributor = new Contributor();
+		contributor.setCreditName("XY");
+		List<Contributor> contributors = Arrays.asList(contributor);
+		AuthorMatcher.match(author, contributors);
+		assertTrue(contributors.get(0).getName().equals("Joe"));
+		assertTrue(contributors.get(0).getSurname().equals("Dodge"));
+		assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));
+	}
 }

From 005f849674c93f44e9a3e66b86211dd8f38f8919 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Fri, 13 Nov 2020 12:45:31 +0100
Subject: [PATCH 29/34] added compression to output dataset

---
 .../dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index cc65b0b4f..a92d534d8 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -128,6 +128,8 @@ public class SparkGenEnrichedOrcidWorks {
 						})
 					.filter(p -> p != null);
 
+				sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
+
 				oafPublicationRDD
 					.mapToPair(
 						p -> new Tuple2<>(p.getClass().toString(),

From c0c2e05eae56c3dad6e111177d88f1959b654d2e Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Tue, 17 Nov 2020 18:23:12 +0100
Subject: [PATCH 30/34] added wf to extracting authors and works xml data from
 orcid dump to hdfs; added wf to download the lamda file (containing last
 orcid update informations) from orcid to hdfs

---
 .../orcid/ActivitiesDecompressor.java         |  61 +++++
 .../orcid/ExtractXMLActivitiesData.java       |  54 ++++
 .../orcid/ExtractXMLSummariesData.java        |  56 +++++
 .../doiboost/orcid/SummariesDecompressor.java |  64 +++++
 .../doiboost/orcid/xml/XMLRecordParser.java   |  31 +++
 .../orcid_download/oozie_app/workflow.xml     |  45 ----
 .../oozie_app/workflow.xml                    | 232 ++++++++++++++++++
 .../oozie_app/config-default.xml              |  26 ++
 .../oozie_app/workflow.xml                    |  40 +++
 .../oozie_app/config-default.xml              |   0
 .../oozie_app/workflow.xml                    |  64 +++++
 11 files changed, 628 insertions(+), 45 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml
 rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{orcid_download => orcid_updates_download}/oozie_app/config-default.xml (100%)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
index 02d2b267b..420c363ec 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
@@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.mortbay.log.Log;
 
 import eu.dnetlib.doiboost.orcid.model.WorkData;
@@ -143,4 +144,64 @@ public class ActivitiesDecompressor {
 		Log.info("Error from Orcid found: " + errorFromOrcidFound);
 		Log.info("Error parsing xml work found: " + xmlParserErrorFound);
 	}
+
+	public static void extractXML(Configuration conf, String inputUri, Path outputPath)
+		throws Exception {
+		String uri = inputUri;
+		FileSystem fs = FileSystem.get(URI.create(uri), conf);
+		Path inputPath = new Path(uri);
+		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+		CompressionCodec codec = factory.getCodec(inputPath);
+		if (codec == null) {
+			System.err.println("No codec found for " + uri);
+			System.exit(1);
+		}
+		CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
+		InputStream gzipInputStream = null;
+		try {
+			gzipInputStream = codec.createInputStream(fs.open(inputPath));
+			int counter = 0;
+			try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+				TarArchiveEntry entry = null;
+				try (SequenceFile.Writer writer = SequenceFile
+					.createWriter(
+						conf,
+						SequenceFile.Writer.file(outputPath),
+						SequenceFile.Writer.keyClass(Text.class),
+						SequenceFile.Writer.valueClass(Text.class),
+						SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
+					while ((entry = tais.getNextTarEntry()) != null) {
+						String filename = entry.getName();
+						if (entry.isDirectory() || !filename.contains("works")) {
+						} else {
+							counter++;
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais));
+							String line;
+							StringBuffer buffer = new StringBuffer();
+							while ((line = br.readLine()) != null) {
+								buffer.append(line);
+							}
+							String xml = buffer.toString();
+							String[] filenameParts = filename.split("/");
+							final Text key = new Text(
+								XMLRecordParser
+									.retrieveOrcidIdFromActivity(
+										xml.getBytes(), filenameParts[filenameParts.length - 1]));
+							final Text value = new Text(xml);
+							writer.append(key, value);
+							if ((counter % 100000) == 0) {
+								Log.info("Current xml works extracted: " + counter);
+							}
+						}
+					}
+				}
+			}
+			Log.info("Activities extraction completed");
+			Log.info("Total XML works parsed: " + counter);
+		} finally {
+			Log.debug("Closing gzip stream");
+			IOUtils.closeStream(gzipInputStream);
+		}
+	}
+
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java
new file mode 100644
index 000000000..c834efa20
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java
@@ -0,0 +1,54 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.mortbay.log.Log;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
+
+public class ExtractXMLActivitiesData extends OrcidDSManager {
+	private String outputWorksPath;
+	private String activitiesFileNameTarGz;
+
+	public static void main(String[] args) throws IOException, Exception {
+		ExtractXMLActivitiesData extractXMLActivitiesData = new ExtractXMLActivitiesData();
+		extractXMLActivitiesData.loadArgs(args);
+		extractXMLActivitiesData.extractWorks();
+	}
+
+	private void loadArgs(String[] args) throws IOException, Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					GenOrcidAuthorWork.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
+		parser.parseArgument(args);
+
+		hdfsServerUri = parser.get("hdfsServerUri");
+		Log.info("HDFS URI: " + hdfsServerUri);
+		workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + workingPath);
+		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
+		Log.info("Activities File Name: " + activitiesFileNameTarGz);
+		outputWorksPath = parser.get("outputWorksPath");
+		Log.info("Output Author Work Data: " + outputWorksPath);
+	}
+
+	private void extractWorks() throws Exception {
+		Configuration conf = initConfigurationObject();
+		FileSystem fs = initFileSystemObject(conf);
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
+		Path outputPath = new Path(
+			hdfsServerUri
+				.concat(workingPath)
+				.concat(outputWorksPath));
+		ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java
new file mode 100644
index 000000000..843889108
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.mortbay.log.Log;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
+
+public class ExtractXMLSummariesData extends OrcidDSManager {
+
+	private String outputAuthorsPath;
+	private String summariesFileNameTarGz;
+
+	public static void main(String[] args) throws IOException, Exception {
+		ExtractXMLSummariesData extractXMLSummariesData = new ExtractXMLSummariesData();
+		extractXMLSummariesData.loadArgs(args);
+		extractXMLSummariesData.extractAuthors();
+	}
+
+	private void loadArgs(String[] args) throws IOException, Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					GenOrcidAuthorWork.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
+		parser.parseArgument(args);
+
+		hdfsServerUri = parser.get("hdfsServerUri");
+		Log.info("HDFS URI: " + hdfsServerUri);
+		workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + workingPath);
+		summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
+		Log.info("Summaries File Name: " + summariesFileNameTarGz);
+		outputAuthorsPath = parser.get("outputAuthorsPath");
+		Log.info("Output Authors Data: " + outputAuthorsPath);
+	}
+
+	public void extractAuthors() throws Exception {
+		Configuration conf = initConfigurationObject();
+		FileSystem fs = initFileSystemObject(conf);
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
+		Path outputPath = new Path(
+			hdfsServerUri
+				.concat(workingPath)
+				.concat(outputAuthorsPath)
+				.concat("xml_authors.seq"));
+		SummariesDecompressor.extractXML(conf, tarGzUri, outputPath);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index d1b2a1d73..c16899977 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -17,6 +17,7 @@ import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.mortbay.log.Log;
 
 import eu.dnetlib.dhp.schema.orcid.AuthorData;
@@ -160,4 +161,67 @@ public class SummariesDecompressor {
 		Log.info("Error from Orcid found: " + errorFromOrcidFound);
 		Log.info("Error parsing xml record found: " + xmlParserErrorFound);
 	}
+
+	public static void extractXML(Configuration conf, String inputUri, Path outputPath)
+		throws Exception {
+		String uri = inputUri;
+		FileSystem fs = FileSystem.get(URI.create(uri), conf);
+		Path inputPath = new Path(uri);
+		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+		CompressionCodec codec = factory.getCodec(inputPath);
+		if (codec == null) {
+			System.err.println("No codec found for " + uri);
+			System.exit(1);
+		}
+		CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
+		InputStream gzipInputStream = null;
+		try {
+			gzipInputStream = codec.createInputStream(fs.open(inputPath));
+			int counter = 0;
+			try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+				TarArchiveEntry entry = null;
+				CompressionCodec Codec = new GzipCodec();
+				org.apache.hadoop.io.SequenceFile.Writer.Option optCom = SequenceFile.Writer
+					.compression(SequenceFile.CompressionType.RECORD, Codec);
+				try (SequenceFile.Writer writer = SequenceFile
+					.createWriter(
+						conf,
+						SequenceFile.Writer.file(outputPath),
+						SequenceFile.Writer.keyClass(Text.class),
+						SequenceFile.Writer.valueClass(Text.class), optCom)) {
+					while ((entry = tais.getNextTarEntry()) != null) {
+						String filename = entry.getName();
+						if (entry.isDirectory()) {
+							Log.debug("Directory entry name: " + entry.getName());
+						} else {
+							Log.debug("XML record entry name: " + entry.getName());
+							counter++;
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais));
+							String line;
+							StringBuffer buffer = new StringBuffer();
+							while ((line = br.readLine()) != null) {
+								buffer.append(line);
+							}
+							String xml = buffer.toString();
+							final Text key = new Text(
+								XMLRecordParser
+									.retrieveOrcidIdFromSummary(
+										xml.getBytes(), filename.split("/")[2].substring(0, 19)));
+							final Text value = new Text(xml);
+							writer.append(key, value);
+						}
+						if ((counter % 100000) == 0) {
+							Log.info("Current xml records extracted: " + counter);
+						}
+					}
+				}
+			}
+			Log.info("Summaries extract completed");
+			Log.info("Total XML records parsed: " + counter);
+
+		} finally {
+			Log.debug("Closing gzip stream");
+			IOUtils.closeStream(gzipInputStream);
+		}
+	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
index a807cf132..cc9abb621 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
@@ -4,6 +4,8 @@ package eu.dnetlib.doiboost.orcid.xml;
 import java.util.Arrays;
 import java.util.List;
 
+import org.mortbay.log.Log;
+
 import com.ximpleware.AutoPilot;
 import com.ximpleware.EOFException;
 import com.ximpleware.EncodingException;
@@ -126,4 +128,33 @@ public class XMLRecordParser {
 		}
 		return workData;
 	}
+
+	public static String retrieveOrcidIdFromSummary(byte[] bytes, String defaultValue)
+		throws VtdException, ParseException {
+		return retrieveOrcidId(bytes, defaultValue, NS_RECORD, NS_RECORD_URL, "//record:record", "path").substring(1);
+	}
+
+	public static String retrieveOrcidIdFromActivity(byte[] bytes, String defaultValue)
+		throws VtdException, ParseException {
+		return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
+	}
+
+	private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
+		String idAttributeName)
+		throws VtdException, ParseException {
+		final VTDGen vg = new VTDGen();
+		vg.setDoc(bytes);
+		vg.parse(true);
+		final VTDNav vn = vg.getNav();
+		final AutoPilot ap = new AutoPilot(vn);
+		ap.declareXPathNameSpace(ns, nsUrl);
+		List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
+			.getTextValuesWithAttributes(
+				ap, vn, xpath, Arrays.asList(idAttributeName));
+		if (!recordNodes.isEmpty()) {
+			return (recordNodes.get(0).getAttributes().get(idAttributeName));
+		}
+		Log.info("id not found - default: " + defaultValue);
+		return defaultValue;
+	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml
deleted file mode 100644
index 1f9adeb4d..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/workflow.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<workflow-app name="Orcid Download" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>workingPathOrcid</name>
-            <description>the working dir base path</description>
-        </property>
-        <property>
-            <name>token</name>
-            <description>access token</description>
-        </property>
-    </parameters>
-    
-    <start to="ResetWorkingPath"/>
-    
-    
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-    
-    <action name="ResetWorkingPath">
-        <fs>
-            <delete path='${workingPathOrcid}/download'/>
-            <mkdir path='${workingPathOrcid}/download'/>
-        </fs>
-        <ok to="DownloadOrcidData"/>
-        <error to="Kill"/>
-    </action>
-	
-	<action name="DownloadOrcidData">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
-            <arg>-d</arg><arg>${workingPathOrcid}/</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>last_modified.csv</arg>
-            <arg>-o</arg><arg>download/</arg>
-            <arg>-t</arg><arg>${token}</arg>
-        </java>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-    
-   <end name="End"/>
-</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml
new file mode 100644
index 000000000..6f629c754
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml
@@ -0,0 +1,232 @@
+<workflow-app name="Extract Orcid XML Works From Activities" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath</name>
+            <description>the working dir base path</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>oozie.action.sharelib.for.java</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                <value>true</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapreduce.map.java.opts</name>
+                <value>-Xmx2g</value>
+            </property>
+            <property>
+                <name>oozie.use.system.libpath</name>
+                <value>true</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="ResetWorkingPath"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath}/xml/works'/>
+            <mkdir path='${workingPath}/xml/works'/>
+        </fs>
+        <ok to="fork_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <fork name = "fork_node">
+        <path start = "ExtractXMLWorkActivities_0"/>
+        <path start = "ExtractXMLWorkActivities_1"/>
+        <path start = "ExtractXMLWorkActivities_2"/>
+        <path start = "ExtractXMLWorkActivities_3"/>
+        <path start = "ExtractXMLWorkActivities_4"/>
+        <path start = "ExtractXMLWorkActivities_5"/>
+        <path start = "ExtractXMLWorkActivities_6"/>
+        <path start = "ExtractXMLWorkActivities_7"/>
+        <path start = "ExtractXMLWorkActivities_8"/>
+        <path start = "ExtractXMLWorkActivities_9"/>
+        <path start = "ExtractXMLWorkActivities_X"/>
+    </fork>
+    
+    <action name="ExtractXMLWorkActivities_0">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_0.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_1">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_1.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_2">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_2.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_3">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_3.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_4">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_4.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_5">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_5.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="ExtractXMLWorkActivities_6">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_6.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_7">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_7.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="ExtractXMLWorkActivities_8">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_8.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_9">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_9.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLWorkActivities_X">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
+            <arg>-ow</arg><arg>xml/works/xml_works_X.seq</arg>
+            <arg>-oew</arg><arg>---</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name = "join_node" to = "End"/>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml
new file mode 100644
index 000000000..191654378
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml
@@ -0,0 +1,26 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.map.java.opts</name>
+        <value>-Xmx8g</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml
new file mode 100644
index 000000000..68d468ab3
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml
@@ -0,0 +1,40 @@
+<workflow-app name="Extract Orcid XML Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath</name>
+            <description>the working dir base path</description>
+        </property>
+    </parameters>
+
+    <start to="ResetWorkingPath"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath}/xml/authors'/>
+            <mkdir path='${workingPath}/xml/authors'/>
+        </fs>
+        <ok to="ExtractXMLAuthorsSummaries"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractXMLAuthorsSummaries">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData</main-class>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
+            <arg>-o</arg><arg>xml/authors/</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/config-default.xml
similarity index 100%
rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_download/oozie_app/config-default.xml
rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/config-default.xml
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
new file mode 100644
index 000000000..a3daab116
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
@@ -0,0 +1,64 @@
+<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>token</name>
+            <description>access token</description>
+        </property>
+        <property>
+            <name>shell_cmd</name>
+            <value>wget -O /tmp/last_modified.csv.tar http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar ; hdfs dfs -copyFromLocal /tmp/last_modified.csv.tar /data/orcid_activities_2020/last_modified.csv.tar ; rm -f /tmp/last_modified.csv.tar
+            </value>
+            <description>the shell command that downloads the lambda file from orcid containing last orcid update informations</description>
+        </property>
+    </parameters>
+    
+    <start to="ResetWorkingPath"/>
+    
+    
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath}/downloads'/>
+            <mkdir path='${workingPath}/downloads'/>
+        </fs>
+        <ok to="DownloadLambdaFile"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="DownloadLambdaFile">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd}</argument>
+            <capture-output/>
+        </shell>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+	<action name="DownloadOrcidData">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
+            <arg>-d</arg><arg>${workingPathOrcid}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>last_modified.csv</arg>
+            <arg>-o</arg><arg>download/</arg>
+            <arg>-t</arg><arg>${token}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    
+   <end name="End"/>
+</workflow-app>
\ No newline at end of file

From 97c8111847a148fb738c593136d16934c6be15cf Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Mon, 23 Nov 2020 09:49:22 +0100
Subject: [PATCH 31/34] action to convert lambda file in seq file; spark action
 to download updated authors

---
 .../doiboost/orcid/OrcidDownloader.java       | 185 +++++++++---------
 .../orcid/SparkDownloadOrcidAuthors.java      | 166 ++++++++++++++++
 .../orcid/SparkGenLastModifiedSeq.java        |  99 ++++++++++
 .../orcid/SparkOrcidGenerateAuthors.java      | 165 ----------------
 .../orcid/SparkPartitionLambdaFile.java       |  50 -----
 .../orcid/model/DownloadedRecordData.java     |  14 +-
 .../gen_orcid_authors_parameters.json         |   4 -
 .../oozie_app/config-default.xml              |  22 ---
 .../orcid_gen_authors/oozie_app/workflow.xml  |  83 --------
 .../oozie_app/workflow.xml                    | 122 +++++++++++-
 .../doiboost/orcid/OrcidClientTest.java       | 139 +++++++++++--
 .../0000-0001-6645-509X.compressed.base64     |   1 -
 .../0000-0003-3028-6161.compressed.base64     |   1 +
 13 files changed, 608 insertions(+), 443 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml
 delete mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
index 762d8aecd..be727ab9f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
@@ -1,14 +1,15 @@
 
 package eu.dnetlib.doiboost.orcid;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.*;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -16,6 +17,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
@@ -27,10 +29,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 public class OrcidDownloader extends OrcidDSManager {
 
 	static final int REQ_LIMIT = 24;
-//	static final int REQ_MAX_TEST = 100;
-	static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10000;
+	static final int REQ_MAX_TEST = -1;
+	static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500;
 	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
-	static final String lastUpdate = "2019-09-30 00:00:00";
+	static final String lastUpdate = "2020-09-29 00:00:00";
 	private String lambdaFileName;
 	private String outputPath;
 	private String token;
@@ -41,7 +43,7 @@ public class OrcidDownloader extends OrcidDSManager {
 		orcidDownloader.parseLambdaFile();
 	}
 
-	private String downloadRecord(String orcidId) {
+	private String downloadRecord(String orcidId) throws IOException {
 		try (CloseableHttpClient client = HttpClients.createDefault()) {
 			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
 			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
@@ -49,17 +51,23 @@ public class OrcidDownloader extends OrcidDSManager {
 			CloseableHttpResponse response = client.execute(httpGet);
 			if (response.getStatusLine().getStatusCode() != 200) {
 				Log
-					.warn(
+					.info(
 						"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
 				return new String("");
 			}
-			return IOUtils.toString(response.getEntity().getContent());
-
-		} catch (Throwable e) {
-			Log.warn("Downloading " + orcidId, e.getMessage());
-
+//			return IOUtils.toString(response.getEntity().getContent());
+			return xmlStreamToString(response.getEntity().getContent());
 		}
-		return new String("");
+	}
+
+	private String xmlStreamToString(InputStream xmlStream) throws IOException {
+		BufferedReader br = new BufferedReader(new InputStreamReader(xmlStream));
+		String line;
+		StringBuffer buffer = new StringBuffer();
+		while ((line = br.readLine()) != null) {
+			buffer.append(line);
+		}
+		return buffer.toString();
 	}
 
 	public void parseLambdaFile() throws Exception {
@@ -76,90 +84,87 @@ public class OrcidDownloader extends OrcidDSManager {
 			hdfsServerUri
 				.concat(workingPath)
 				.concat(outputPath)
-				.concat("orcid_records.seq"));
-
-		try (SequenceFile.Writer writer = SequenceFile
-			.createWriter(
-				conf,
-				SequenceFile.Writer.file(hdfsoutputPath),
-				SequenceFile.Writer.keyClass(Text.class),
-				SequenceFile.Writer.valueClass(Text.class))) {
-
-			try (BufferedReader br = new BufferedReader(new InputStreamReader(lambdaFileStream))) {
-				String line;
-				int nReqTmp = 0;
+				.concat("updated_xml_authors.seq"));
+		try (TarArchiveInputStream tais = new TarArchiveInputStream(
+			new GzipCompressorInputStream(lambdaFileStream))) {
+			TarArchiveEntry entry = null;
+			StringBuilder sb = new StringBuilder();
+			try (SequenceFile.Writer writer = SequenceFile
+				.createWriter(
+					conf,
+					SequenceFile.Writer.file(hdfsoutputPath),
+					SequenceFile.Writer.keyClass(Text.class),
+					SequenceFile.Writer.valueClass(Text.class),
+					SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
 				startDownload = System.currentTimeMillis();
-				long startReqTmp = System.currentTimeMillis();
-				while ((line = br.readLine()) != null) {
-					parsedRecordsCounter++;
-					// skip headers line
-					if (parsedRecordsCounter == 1) {
-						continue;
-					}
-					String[] values = line.split(",");
-					List<String> recordInfo = Arrays.asList(values);
-					String orcidId = recordInfo.get(0);
-					if (isModified(orcidId, recordInfo.get(3))) {
-						String record = downloadRecord(orcidId);
-						downloadedRecordsCounter++;
-						if (!record.isEmpty()) {
-							String compressRecord = ArgumentApplicationParser.compressArgument(record);
-							final Text key = new Text(recordInfo.get(0));
-							final Text value = new Text(compressRecord);
-
-							try {
+				while ((entry = tais.getNextTarEntry()) != null) {
+					BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
+					String line;
+					while ((line = br.readLine()) != null) {
+						String[] values = line.split(",");
+						List<String> recordInfo = Arrays.asList(values);
+						int nReqTmp = 0;
+						long startReqTmp = System.currentTimeMillis();
+						// skip headers line
+						if (parsedRecordsCounter == 0) {
+							parsedRecordsCounter++;
+							continue;
+						}
+						parsedRecordsCounter++;
+						String orcidId = recordInfo.get(0);
+						if (isModified(orcidId, recordInfo.get(3))) {
+							String record = downloadRecord(orcidId);
+							downloadedRecordsCounter++;
+							if (!record.isEmpty()) {
+//							String compressRecord = ArgumentApplicationParser.compressArgument(record);
+								final Text key = new Text(recordInfo.get(0));
+								final Text value = new Text(record);
 								writer.append(key, value);
 								savedRecordsCounter++;
-							} catch (IOException e) {
-								Log.warn("Writing to sequence file: " + e.getMessage());
-								Log.warn(e);
-								throw new RuntimeException(e);
+							}
+						} else {
+							break;
+						}
+						long endReq = System.currentTimeMillis();
+						nReqTmp++;
+						if (nReqTmp == REQ_LIMIT) {
+							long reqSessionDuration = endReq - startReqTmp;
+							if (reqSessionDuration <= 1000) {
+								Log
+									.info(
+										"\nreqSessionDuration: "
+											+ reqSessionDuration
+											+ " nReqTmp: "
+											+ nReqTmp
+											+ " wait ....");
+								Thread.sleep(1000 - reqSessionDuration);
+							} else {
+								nReqTmp = 0;
+								startReqTmp = System.currentTimeMillis();
+							}
+						}
+						if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
+							Log
+								.info(
+									"Current parsed: "
+										+ parsedRecordsCounter
+										+ " downloaded: "
+										+ downloadedRecordsCounter
+										+ " saved: "
+										+ savedRecordsCounter);
+							if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) {
+								break;
 							}
 						}
 					}
-					long endReq = System.currentTimeMillis();
-					nReqTmp++;
-					if (nReqTmp == REQ_LIMIT) {
-						long reqSessionDuration = endReq - startReqTmp;
-						if (reqSessionDuration <= 1000) {
-							Log
-								.warn(
-									"\nreqSessionDuration: "
-										+ reqSessionDuration
-										+ " nReqTmp: "
-										+ nReqTmp
-										+ " wait ....");
-							Thread.sleep(1000 - reqSessionDuration);
-						} else {
-							nReqTmp = 0;
-							startReqTmp = System.currentTimeMillis();
-						}
-					}
-
-//					if (parsedRecordsCounter > REQ_MAX_TEST) {
-//						break;
-//					}
-					if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
-						Log
-							.info(
-								"Current parsed: "
-									+ parsedRecordsCounter
-									+ " downloaded: "
-									+ downloadedRecordsCounter
-									+ " saved: "
-									+ savedRecordsCounter);
-//						if (parsedRecordsCounter > REQ_MAX_TEST) {
-//							break;
-//						}
-					}
+					long endDownload = System.currentTimeMillis();
+					long downloadTime = endDownload - startDownload;
+					Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
 				}
-				long endDownload = System.currentTimeMillis();
-				long downloadTime = endDownload - startDownload;
-				Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
 			}
 		}
-		lambdaFileStream.close();
 		Log.info("Download started at: " + new Date(startDownload).toString());
+		Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString());
 		Log.info("Parsed Records Counter: " + parsedRecordsCounter);
 		Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
 		Log.info("Saved Records Counter: " + savedRecordsCounter);
@@ -185,7 +190,7 @@ public class OrcidDownloader extends OrcidDSManager {
 		token = parser.get("token");
 	}
 
-	private boolean isModified(String orcidId, String modifiedDate) {
+	public boolean isModified(String orcidId, String modifiedDate) {
 		Date modifiedDateDt = null;
 		Date lastUpdateDt = null;
 		try {
@@ -195,7 +200,7 @@ public class OrcidDownloader extends OrcidDSManager {
 			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
 			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
 		} catch (Exception e) {
-			Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
+			Log.info("[" + orcidId + "] Parsing date: ", e.getMessage());
 			return true;
 		}
 		return modifiedDateDt.after(lastUpdateDt);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
new file mode 100644
index 000000000..850a654d4
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
@@ -0,0 +1,166 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.util.LongAccumulator;
+import org.mortbay.log.Log;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
+import scala.Tuple2;
+
+public class SparkDownloadOrcidAuthors {
+
+	static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
+	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
+	static final String lastUpdate = "2020-09-29 00:00:00";
+
+	public static void main(String[] args) throws IOException, Exception {
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkDownloadOrcidAuthors.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
+		parser.parseArgument(args);
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+		final String workingPath = parser.get("workingPath");
+		logger.info("workingPath: ", workingPath);
+		final String outputPath = parser.get("outputPath");
+		logger.info("outputPath: ", outputPath);
+		final String token = parser.get("token");
+		final String lambdaFileName = parser.get("lambdaFileName");
+		logger.info("lambdaFileName: ", lambdaFileName);
+
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+				LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsedRecords");
+				LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("modifiedRecords");
+				LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloadedRecords");
+
+				logger.info("Retrieving data from lamda sequence file");
+				JavaPairRDD<Text, Text> lamdaFileRDD = sc
+					.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
+				logger.info("Data retrieved: " + lamdaFileRDD.count());
+
+				Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> {
+					String orcidId = data._1().toString();
+					String lastModifiedDate = data._2().toString();
+					parsedRecordsAcc.add(1);
+					if (isModified(orcidId, lastModifiedDate)) {
+						modifiedRecordsAcc.add(1);
+						return true;
+					}
+					return false;
+				};
+
+				Function<Tuple2<Text, Text>, Tuple2<String, String>> downloadRecordFunction = data -> {
+					String orcidId = data._1().toString();
+					String lastModifiedDate = data._2().toString();
+					final DownloadedRecordData downloaded = new DownloadedRecordData();
+					downloaded.setOrcidId(orcidId);
+					downloaded.setLastModifiedDate(lastModifiedDate);
+					try (CloseableHttpClient client = HttpClients.createDefault()) {
+						HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
+						httpGet.addHeader("Accept", "application/vnd.orcid+xml");
+						httpGet.addHeader("Authorization", String.format("Bearer %s", token));
+						CloseableHttpResponse response = client.execute(httpGet);
+						int statusCode = response.getStatusLine().getStatusCode();
+						downloaded.setStatusCode(statusCode);
+						if (statusCode != 200) {
+							logger
+								.info(
+									"Downloading " + orcidId + " status code: "
+										+ response.getStatusLine().getStatusCode());
+							return downloaded.toTuple2();
+						}
+						downloadedRecordsAcc.add(1);
+						long currentDownloaded = downloadedRecordsAcc.value();
+						if ((currentDownloaded % 10000) == 0) {
+							logger.info("Current downloaded: " + currentDownloaded);
+						}
+						downloaded
+							.setCompressedData(
+								ArgumentApplicationParser
+									.compressArgument(IOUtils.toString(response.getEntity().getContent())));
+					} catch (Throwable e) {
+						logger.info("Downloading " + orcidId, e.getMessage());
+						downloaded.setErrorMessage(e.getMessage());
+						return downloaded.toTuple2();
+					}
+					return downloaded.toTuple2();
+				};
+
+				sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
+
+				logger.info("Start execution ...");
+//				List<Tuple2<Text, Text>> sampleList = lamdaFileRDD.take(500);
+//				JavaRDD<Tuple2<Text, Text>> sampleRDD = sc.parallelize(sampleList);
+//				sampleRDD
+				JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD
+					.filter(isModifiedAfterFilter);
+				logger.info("Authors modified count: " + authorsModifiedRDD.count());
+				logger.info("Start downloading ...");
+				authorsModifiedRDD
+					.map(downloadRecordFunction)
+					.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
+					.saveAsNewAPIHadoopFile(
+						workingPath.concat(outputPath),
+						Text.class,
+						Text.class,
+						SequenceFileOutputFormat.class,
+						sc.hadoopConfiguration());
+				logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
+				logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
+				logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
+			});
+
+	}
+
+	private static boolean isModified(String orcidId, String modifiedDate) {
+		Date modifiedDateDt = null;
+		Date lastUpdateDt = null;
+		try {
+			if (modifiedDate.length() != 19) {
+				modifiedDate = modifiedDate.substring(0, 19);
+			}
+			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
+			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
+		} catch (Exception e) {
+			logger.info("[" + orcidId + "] Parsing date: ", e.getMessage());
+			return true;
+		}
+		return modifiedDateDt.after(lastUpdateDt);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java
new file mode 100644
index 000000000..f710635ab
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java
@@ -0,0 +1,99 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.mortbay.log.Log;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class SparkGenLastModifiedSeq {
+	private static String hdfsServerUri;
+	private static String workingPath;
+	private static String outputPath;
+	private static String lambdaFileName;
+
+	public static void main(String[] args) throws IOException, Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkGenLastModifiedSeq.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
+		parser.parseArgument(args);
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		hdfsServerUri = parser.get("hdfsServerUri");
+		workingPath = parser.get("workingPath");
+		outputPath = parser.get("outputPath");
+		lambdaFileName = parser.get("lambdaFileName");
+		String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
+
+		SparkConf sparkConf = new SparkConf();
+		runWithSparkSession(
+			sparkConf,
+			isSparkSessionManaged,
+			spark -> {
+				int rowsNum = 0;
+				Path output = new Path(
+					hdfsServerUri
+						.concat(workingPath)
+						.concat(outputPath));
+				Path hdfsreadpath = new Path(lambdaFileUri);
+				Configuration conf = new Configuration();
+				conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
+				conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+				conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+				FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
+				FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
+				try (TarArchiveInputStream tais = new TarArchiveInputStream(
+					new GzipCompressorInputStream(lambdaFileStream))) {
+					TarArchiveEntry entry = null;
+					try (SequenceFile.Writer writer = SequenceFile
+						.createWriter(
+							conf,
+							SequenceFile.Writer.file(output),
+							SequenceFile.Writer.keyClass(Text.class),
+							SequenceFile.Writer.valueClass(Text.class),
+							SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
+						while ((entry = tais.getNextTarEntry()) != null) {
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais));
+							String line;
+							while ((line = br.readLine()) != null) {
+								String[] values = line.split(",");
+								List<String> recordInfo = Arrays.asList(values);
+								String orcidId = recordInfo.get(0);
+								final Text key = new Text(orcidId);
+								final Text value = new Text(recordInfo.get(3));
+								writer.append(key, value);
+								rowsNum++;
+							}
+						}
+					}
+				}
+				Log.info("Saved rows from lamda csv tar file: " + rowsNum);
+			});
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java
deleted file mode 100644
index 4e18ab840..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java
+++ /dev/null
@@ -1,165 +0,0 @@
-
-package eu.dnetlib.doiboost.orcid;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.io.IOException;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.List;
-import java.util.Optional;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.io.Text;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClients;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.util.LongAccumulator;
-import org.mortbay.log.Log;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
-import scala.Tuple2;
-
-public class SparkOrcidGenerateAuthors {
-
-	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
-	static final String lastUpdate = "2019-09-30 00:00:00";
-
-	public static void main(String[] args) throws IOException, Exception {
-		Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
-		logger.info("[ SparkOrcidGenerateAuthors STARTED]");
-
-		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkOrcidGenerateAuthors.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
-		parser.parseArgument(args);
-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-		final String workingPath = parser.get("workingPath");
-		logger.info("workingPath: ", workingPath);
-		final String outputAuthorsPath = parser.get("outputAuthorsPath");
-		logger.info("outputAuthorsPath: ", outputAuthorsPath);
-		final String token = parser.get("token");
-
-		SparkConf conf = new SparkConf();
-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-
-				LongAccumulator parsedRecordsAcc = sc.sc().longAccumulator("parsedRecords");
-				LongAccumulator modifiedRecordsAcc = sc.sc().longAccumulator("modifiedRecords");
-				LongAccumulator downloadedRecordsAcc = sc.sc().longAccumulator("downloadedRecords");
-				LongAccumulator alreadyDownloadedRecords = sc.sc().longAccumulator("alreadyDownloadedRecords");
-				JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "lamdafiles");
-
-				JavaRDD<String> downloadedRDD = sc.textFile(workingPath + "downloaded");
-				Function<String, String> getOrcidIdFunction = line -> {
-					try {
-						String[] values = line.split(",");
-						return values[0].substring(1);
-					} catch (Exception e) {
-						return new String("");
-					}
-				};
-				List<String> downloadedRecords = downloadedRDD.map(getOrcidIdFunction).collect();
-
-				Function<String, Boolean> isModifiedAfterFilter = line -> {
-					String[] values = line.split(",");
-					String orcidId = values[0];
-					parsedRecordsAcc.add(1);
-					if (isModified(orcidId, values[3])) {
-						modifiedRecordsAcc.add(1);
-						return true;
-					}
-					return false;
-				};
-				Function<String, Boolean> isNotDownloadedFilter = line -> {
-					String[] values = line.split(",");
-					String orcidId = values[0];
-					if (downloadedRecords.contains(orcidId)) {
-						alreadyDownloadedRecords.add(1);
-						return false;
-					}
-					return true;
-				};
-				Function<String, Tuple2<String, String>> downloadRecordFunction = line -> {
-					String[] values = line.split(",");
-					String orcidId = values[0];
-					String modifiedDate = values[3];
-					return downloadRecord(orcidId, modifiedDate, token, downloadedRecordsAcc);
-				};
-
-				lamdaFileRDD
-					.filter(isModifiedAfterFilter)
-					.filter(isNotDownloadedFilter)
-					.map(downloadRecordFunction)
-					.rdd()
-					.saveAsTextFile(workingPath.concat(outputAuthorsPath));
-			});
-
-	}
-
-	private static boolean isModified(String orcidId, String modifiedDate) {
-		Date modifiedDateDt = null;
-		Date lastUpdateDt = null;
-		try {
-			if (modifiedDate.length() != 19) {
-				modifiedDate = modifiedDate.substring(0, 19);
-			}
-			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
-			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
-		} catch (Exception e) {
-			Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
-			return true;
-		}
-		return modifiedDateDt.after(lastUpdateDt);
-	}
-
-	private static Tuple2<String, String> downloadRecord(String orcidId, String modifiedDate, String token,
-		LongAccumulator downloadedRecordsAcc) {
-		final DownloadedRecordData data = new DownloadedRecordData();
-		data.setOrcidId(orcidId);
-		data.setModifiedDate(modifiedDate);
-		try (CloseableHttpClient client = HttpClients.createDefault()) {
-			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
-			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
-			httpGet.addHeader("Authorization", String.format("Bearer %s", token));
-			CloseableHttpResponse response = client.execute(httpGet);
-			int statusCode = response.getStatusLine().getStatusCode();
-			data.setStatusCode(statusCode);
-			if (statusCode != 200) {
-				Log
-					.warn(
-						"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
-				return data.toTuple2();
-			}
-			downloadedRecordsAcc.add(1);
-			data
-				.setCompressedData(
-					ArgumentApplicationParser.compressArgument(IOUtils.toString(response.getEntity().getContent())));
-		} catch (Throwable e) {
-			Log.warn("Downloading " + orcidId, e.getMessage());
-			data.setErrorMessage(e.getMessage());
-			return data.toTuple2();
-		}
-		return data.toTuple2();
-	}
-}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java
deleted file mode 100644
index ca6f0f6c4..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPartitionLambdaFile.java
+++ /dev/null
@@ -1,50 +0,0 @@
-
-package eu.dnetlib.doiboost.orcid;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.io.IOException;
-import java.util.Optional;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-
-public class SparkPartitionLambdaFile {
-
-	public static void main(String[] args) throws IOException, Exception {
-		Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
-
-		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkOrcidGenerateAuthors.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
-		parser.parseArgument(args);
-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-		final String workingPath = parser.get("workingPath");
-
-		SparkConf conf = new SparkConf();
-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-				JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "last_modified.csv");
-
-				lamdaFileRDD
-					.repartition(20)
-					.saveAsTextFile(workingPath.concat("lamdafiles"));
-			});
-	}
-
-}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java
index f66ef82a2..da1a79b19 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/DownloadedRecordData.java
@@ -3,8 +3,6 @@ package eu.dnetlib.doiboost.orcid.model;
 
 import java.io.Serializable;
 
-import org.apache.hadoop.io.Text;
-
 import com.google.gson.JsonObject;
 
 import scala.Tuple2;
@@ -12,7 +10,7 @@ import scala.Tuple2;
 public class DownloadedRecordData implements Serializable {
 
 	private String orcidId;
-	private String modifiedDate;
+	private String lastModifiedDate;
 	private String statusCode;
 	private String compressedData;
 	private String errorMessage;
@@ -20,7 +18,7 @@ public class DownloadedRecordData implements Serializable {
 	public Tuple2<String, String> toTuple2() {
 		JsonObject data = new JsonObject();
 		data.addProperty("statusCode", getStatusCode());
-		data.addProperty("modifiedDate", getModifiedDate());
+		data.addProperty("lastModifiedDate", getLastModifiedDate());
 		if (getCompressedData() != null) {
 			data.addProperty("compressedData", getCompressedData());
 		}
@@ -66,11 +64,11 @@ public class DownloadedRecordData implements Serializable {
 		this.compressedData = compressedData;
 	}
 
-	public String getModifiedDate() {
-		return modifiedDate;
+	public String getLastModifiedDate() {
+		return lastModifiedDate;
 	}
 
-	public void setModifiedDate(String modifiedDate) {
-		this.modifiedDate = modifiedDate;
+	public void setLastModifiedDate(String lastModifiedDate) {
+		this.lastModifiedDate = lastModifiedDate;
 	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json
deleted file mode 100644
index 35bfe1b41..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json
+++ /dev/null
@@ -1,4 +0,0 @@
-[{"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the working path",	"paramRequired": true},
- {"paramName":"t",   "paramLongName":"token",	"paramDescription": "token to grant access",	"paramRequired": true},
- {"paramName":"o",   "paramLongName":"outputAuthorsPath",	"paramDescription": "the relative folder of the sequencial file to write the authors data",	"paramRequired": true}
-]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml
deleted file mode 100644
index a720e7592..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<configuration>
-    <property>
-            <name>jobTracker</name>
-            <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
-    </property>
-    <property>
-            <name>nameNode</name>
-            <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
-    </property>
-    <property>
-            <name>queueName</name>
-            <value>default</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml
deleted file mode 100644
index 7ebc5f0a0..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml
+++ /dev/null
@@ -1,83 +0,0 @@
-<workflow-app name="Gen Orcid Authors" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>workingPath</name>
-            <description>the working dir base path</description>
-        </property>
-        <property>
-            <name>token</name>
-            <description>access token</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-        <property>
-            <name>outputPath</name>
-            <description>the working dir base path</description>
-        </property>
-    </parameters>
-    
-    <start to="ResetWorkingPath"/>
-    
-    
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-    
-    <action name="ResetWorkingPath">
-        <fs>
-            <delete path='${workingPath_activities}/authors'/>
-        </fs>
-        <ok to="Gen_Orcid_Authors"/>
-        <error to="Kill"/>
-    </action>
-	
-	<action name="Split_Lambda_File">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-        	<job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Split_Lambda_File</name>
-            <class>eu.dnetlib.doiboost.orcid.SparkPartitionLambdaFile</class>
-            <jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
-            <spark-opts>--num-executors 24 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
-            </spark-opts>
-            <arg>-w</arg><arg>${workingPath}/</arg>
-            <arg>-o</arg><arg>authors/</arg>
-            <arg>-t</arg><arg>${token}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-    
-	<action name="Gen_Orcid_Authors">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-        	<job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Gen_Orcid_Authors</name>
-            <class>eu.dnetlib.doiboost.orcid.SparkOrcidGenerateAuthors</class>
-            <jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
-            <spark-opts>--num-executors 20 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
-            </spark-opts>
-            <arg>-w</arg><arg>${workingPath}/</arg>
-            <arg>-o</arg><arg>authors/</arg>
-            <arg>-t</arg><arg>${token}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-    
-   <end name="End"/>
-</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
index a3daab116..5f728d35b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
@@ -14,9 +14,63 @@
             </value>
             <description>the shell command that downloads the lambda file from orcid containing last orcid update informations</description>
         </property>
+        <property>
+            <name>sparkExecutorNumber</name>
+            <value>20</value>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <value>7G</value>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <value>2G</value>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <value>1</value>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>spark2MaxExecutors</name>
+            <value>20</value>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
     </parameters>
-    
-    <start to="ResetWorkingPath"/>
+
+    <global>
+        <configuration>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="DownloadOrcidAuthors"/>
     
     
     <kill name="Kill">
@@ -26,6 +80,7 @@
     <action name="ResetWorkingPath">
         <fs>
             <delete path='${workingPath}/downloads'/>
+            <delete path='${workingPath}/last_modified.csv.tar'/>
             <mkdir path='${workingPath}/downloads'/>
         </fs>
         <ok to="DownloadLambdaFile"/>
@@ -41,24 +96,77 @@
             <argument>${shell_cmd}</argument>
             <capture-output/>
         </shell>
-        <ok to="End"/>
+        <ok to="DownloadUpdatedXMLAuthors"/>
         <error to="Kill"/>
     </action>
 
-	<action name="DownloadOrcidData">
+	<action name="DownloadUpdatedXMLAuthors">
         <java>
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
-            <arg>-d</arg><arg>${workingPathOrcid}/</arg>
+            <arg>-w</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-f</arg><arg>last_modified.csv</arg>
-            <arg>-o</arg><arg>download/</arg>
+            <arg>-f</arg><arg>last_modified.csv.tar</arg>
+            <arg>-o</arg><arg>downloads/</arg>
             <arg>-t</arg><arg>${token}</arg>
         </java>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
+
+    <action name="GenLastModifiedSeq">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>GenLastModifiedSeq</name>
+            <class>eu.dnetlib.doiboost.orcid.SparkGenLastModifiedSeq</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+            </spark-opts>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>last_modified.csv.tar</arg>
+            <arg>-o</arg><arg>last_modified.seq</arg>
+            <arg>-t</arg><arg>-</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="DownloadOrcidAuthors">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>DownloadOrcidAuthors</name>
+            <class>eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --num-executors=${sparkExecutorNumber}
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+            </spark-opts>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>last_modified.seq</arg>
+            <arg>-o</arg><arg>downloads/updated_authors</arg>
+            <arg>-t</arg><arg>${token}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
     
    <end name="End"/>
 </workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
index 774475626..d6ce99f1c 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
@@ -5,17 +5,24 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.*;
 import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
+import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull;
 import org.junit.jupiter.api.Test;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@@ -37,12 +44,49 @@ public class OrcidClientTest {
 //	'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
 
 	@Test
-	public void downloadTest() throws Exception {
-		String record = testDownloadRecord("0000-0001-6163-2042");
-		File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml");
+	private void multipleDownloadTest() throws Exception {
+		int toDownload = 1;
+		long start = System.currentTimeMillis();
+		OrcidDownloader downloader = new OrcidDownloader();
+		TarArchiveInputStream input = new TarArchiveInputStream(
+			new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
+		TarArchiveEntry entry = input.getNextTarEntry();
+		BufferedReader br = null;
+		StringBuilder sb = new StringBuilder();
+		int rowNum = 0;
+		int entryNum = 0;
+		int modified = 0;
+		while (entry != null) {
+			br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
+			String line;
+			while ((line = br.readLine()) != null) {
+				String[] values = line.toString().split(",");
+				List<String> recordInfo = Arrays.asList(values);
+				String orcidId = recordInfo.get(0);
+				if (downloader.isModified(orcidId, recordInfo.get(3))) {
+					downloadTest(orcidId);
+					modified++;
+				}
+				rowNum++;
+				if (modified > toDownload) {
+					break;
+				}
+			}
+			entryNum++;
+			entry = input.getNextTarEntry();
+		}
+		long end = System.currentTimeMillis();
+		logToFile("start test: " + new Date(start).toString());
+		logToFile("end test: " + new Date(end).toString());
+	}
+
+	@Test
+	private void downloadTest(String orcid) throws Exception {
+		String record = testDownloadRecord(orcid);
+		String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
+		File f = new File(filename);
 		OutputStream outStream = new FileOutputStream(f);
 		IOUtils.write(record.getBytes(), outStream);
-		System.out.println("saved to tmp");
 	}
 
 	private String testDownloadRecord(String orcidId) throws Exception {
@@ -50,7 +94,9 @@ public class OrcidClientTest {
 			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
 			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
 			httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
+			logToFile("start connection: " + new Date(System.currentTimeMillis()).toString());
 			CloseableHttpResponse response = client.execute(httpGet);
+			logToFile("end connection: " + new Date(System.currentTimeMillis()).toString());
 			if (response.getStatusLine().getStatusCode() != 200) {
 				System.out
 					.println("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
@@ -62,7 +108,7 @@ public class OrcidClientTest {
 		return new String("");
 	}
 
-//	@Test
+	// @Test
 	private void testLambdaFileParser() throws Exception {
 		try (BufferedReader br = new BufferedReader(
 			new InputStreamReader(this.getClass().getResourceAsStream("last_modified.csv")))) {
@@ -108,7 +154,7 @@ public class OrcidClientTest {
 		}
 	}
 
-//	@Test
+	// @Test
 	private void getRecordDatestamp() throws ParseException {
 		Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
 		Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
@@ -126,7 +172,7 @@ public class OrcidClientTest {
 		System.out.println(valueDt.toString());
 	}
 
-//	@Test
+	// @Test
 	@Ignore
 	private void testModifiedDate() throws ParseException {
 		testDate(toRetrieveDate);
@@ -134,14 +180,81 @@ public class OrcidClientTest {
 		testDate(shortDate);
 	}
 
-//	@Test
-	@Ignore
-	private void testReadBase64CompressedRecord() throws Exception {
+	@Test
+	public void testReadBase64CompressedRecord() throws Exception {
 		final String base64CompressedRecord = IOUtils
-			.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
+			.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
 		final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
-		System.out.println(recordFromSeqFile);
-		final String downloadedRecord = testDownloadRecord("0000-0001-6645-509X");
+		logToFile("\n\ndownloaded \n\n" + recordFromSeqFile);
+		final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161");
 		assertTrue(recordFromSeqFile.equals(downloadedRecord));
 	}
+
+	@Test
+	private void lambdaFileReaderTest() throws Exception {
+		TarArchiveInputStream input = new TarArchiveInputStream(
+			new GzipCompressorInputStream(new FileInputStream("/develop/last_modified.csv.tar")));
+		TarArchiveEntry entry = input.getNextTarEntry();
+		BufferedReader br = null;
+		StringBuilder sb = new StringBuilder();
+		int rowNum = 0;
+		int entryNum = 0;
+		while (entry != null) {
+			br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
+			String line;
+			while ((line = br.readLine()) != null) {
+				String[] values = line.toString().split(",");
+				List<String> recordInfo = Arrays.asList(values);
+				assertTrue(recordInfo.size() == 4);
+
+				rowNum++;
+				if (rowNum == 1) {
+					assertTrue(recordInfo.get(3).equals("last_modified"));
+				} else if (rowNum == 2) {
+					assertTrue(recordInfo.get(0).equals("0000-0002-0499-7333"));
+				}
+			}
+			entryNum++;
+			assertTrue(entryNum == 1);
+			entry = input.getNextTarEntry();
+		}
+	}
+
+	@Test
+	private void lambdaFileCounterTest() throws Exception {
+		final String lastUpdate = "2020-09-29 00:00:00";
+		OrcidDownloader downloader = new OrcidDownloader();
+		TarArchiveInputStream input = new TarArchiveInputStream(
+			new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
+		TarArchiveEntry entry = input.getNextTarEntry();
+		BufferedReader br = null;
+		StringBuilder sb = new StringBuilder();
+		int rowNum = 0;
+		int entryNum = 0;
+		int modified = 0;
+		while (entry != null) {
+			br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
+			String line;
+			while ((line = br.readLine()) != null) {
+				String[] values = line.toString().split(",");
+				List<String> recordInfo = Arrays.asList(values);
+				String orcidId = recordInfo.get(0);
+				if (downloader.isModified(orcidId, recordInfo.get(3))) {
+					modified++;
+				}
+				rowNum++;
+			}
+			entryNum++;
+			entry = input.getNextTarEntry();
+		}
+		logToFile("rowNum: " + rowNum);
+		logToFile("modified: " + modified);
+	}
+
+	private void logToFile(String log)
+		throws IOException {
+		log = log.concat("\n");
+		Path path = Paths.get("/tmp/orcid_log.txt");
+		Files.write(path, log.getBytes(), StandardOpenOption.APPEND);
+	}
 }
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64 b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64
deleted file mode 100644
index 1b088e061..000000000
--- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0001-6645-509X.compressed.base64
+++ /dev/null
@@ -1 +0,0 @@
-H4sIAAAAAAAAAO1a227bOBB9z1cIepd18SW24aho0wTbAgEWjRdY9I2RaJtbSdSSkhP165eURIm6kHa2SbCLNkBiWDxzhhxyZg7tbN49xZFxhIQinFyZ7sQxDZgEOETJ/sr8Y3trLU2DZiAJQYQTeGUWkJrv/IsNgQEm4bp6MVKQHa5M22E/Fvt1rcViNrfmzupP02AOErpGSQZJAqIr85Bl6dq2Hx8fJ5gEKGR/93ZCbYEQFjDMA5CV01KZNBBhEyKaoSTQW0mgxg6mbCUgg6HGrMEIK5wdILESEEO1VYsRVjGMH1i8DyhVW7WYJhqEYKKJBB8W2ADHsS4A1bhAV1uoRlfjAp2yaWG2S1YIM4AiqrbrIwXDN1g8ah3WgGblMbPWrJwPN9in6gxZKIRJhnYI6mI2BAueXZ5UGaCyrQFNVAjcQcISB+oC0oKEHQhDAqnGpga0WXRE7ABaKaZIf8j7SMHAIvtNbcVHBfLA0gSTQg2uAe0+pREuYhZK3WYJjLD6OwcRC/2pTO/AhC2F5IgCTfLVgO7ZPXVim71hFYLFEOm2tMW02UQhIAFP+pxojm0X186QvSfwiOCjbpoNSNg95JFmV/lof36MgOKc6KI3gJr+hcF+NlX9WJdgKXmqURmRE+RzdsroW+qRLrGxJYsBDe8uvs6qBAzMDphmfuO2AZePq4XY2pVspISVM1zyJCMiHIAI+jDZ2COPa4dayk2dUSL1JEdiJCCwTAErhtkBh/5d2SiskonAcGOrgEMqmj/EiPK+b4Wsq/me464sZ2l53tadrmeLtXc58ZbLry1n32IQ8QjQzIqZeGBBDAWrx7Ztbrnu1puu59P11JksPfdrE/sRm5FlRwDFMPQzkkNpjfXTIZ4Jmoqv7A49s96gxjolKAak0LN0QfU+j+7kpiowdR3SiCZRieSTVplyIWEcEUUPKEIZK85p/hChwKzJxgRYSyJvVXk+2k0abv187rWb1EGP8o1u/QlW3dZLi24lxHqPjjAp1RT1twgkRb4Z6IwO6ATfDsQoKkqs/xmBETIZ0e6GLW2H9LgVe5I2pLqNlmCmLTF120Ovq2gZe9AOa3lEK0Gl5ag0lWxZ6xAhWPSLEqJFJqhFnVB/WnuB6c59qNbG5J5+XSN44aTZ0+qlftg2eEkPWDSPecprY9Aqg2fUyZnlTLfObD2brZ3pZHm5OLNOStOUbjfaWMi47la3XM39Sh/VBqXkaWTfiWPXwFRMte7W0giMiqMvjbVkA7CKtb2yafkkmIpJ0ndaKhmn4uroZi1bF6niG2jCs2pRi1bx1kpdyyYwKg5+edESlABFP3zplOxPbk9wnnaHX9u9zC9VPjpEKZDjQAXYyooU+iFGzfwGg8+iO4Ioh77rTFzXWdnvr69v7u8nPCYTb7X0PNcZ9VNZPctRgknMjv53GBoZAQlF5Q2Wiz2zcQ8Cdu7oafct1/PmwDp1c1FiISyvSc9dOud4llMCoyrZWTHyKYx2o7Qd1PjJGTEbOYkjqJGjuOFJWqZy22XzzApwyG6qly67kCxWjnkqy+0WOSaWWe9LI1BYKAnhE1PNpj4lelqZp+XUmjpbz1szYTt3JjP38hyt3Od9raSXfVR19/TBqHBWEPHjr8192Wr8gl+RSJuzWi5nlrtyp+P3fJ2H3t1/yNS9++uoTn4eMGpsPztAvZCWd4Rrgillt/Q+XfcCoXGsAJXZkqEsOmOLK9g9K1CR9ZFdnBN+kzdu2WnNCTTuQEbQk3HNMp3VvlIXGnflZwfGDhPjI6y+FDC+wBQyJnbHMm7Ze0iMO3yElba7JTg2biIYZATzzzXSA4jwnoDYuEd7lvK0WZRmyhv71KLOb2oK9Hnn5YWam4ryVRqcytlbNznVPF690akcv1SzK/nPangq5An99W8jpIxKXSP4Gf2LlRI+CUAyFERQZJry+DZFuOyb1eeJ6pYjWxRM95fNrJlf+UQfpPPcVOsRS6nKxKebmxvjfXl+60V1x0fUyEBn9LS7rRfvP6rt64/GVlt3vnYXa8ebLJz5T6jt53ObB8OeLl2m2WZvJurP8fviav4cpz+BjF+4znzqzd3TMr5FvryMP5GBPyjjXyC/ZR+/ZPwvGd+Rzh8IQIl1jWOWVkyDf+L/PLMDATSuDyBJYGTdQ67DuYq/ZxUwg/vC+AAoq4fsyXuWtwVF1MA74+bIA/GFlwc2+BHSIgkOBCfoe1kvjC1OuYRPD4WBSi78DRq/szGu+H/p+ddqaiovb9bYVBN4veam8vj/l+6q0PwnNbu7OkOzy3bslxf3ZWNWPThpF4LC91or/va17gefq3e83v0GQZQdAkCgcZPsUQIhQcn+DW4NnbHyqwjxxaP2S0b/YmN3/tnSv/gH9+klwrUpAAA=
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64 b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64
new file mode 100644
index 000000000..8dc3d32ad
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/0000-0003-3028-6161.compressed.base64
@@ -0,0 +1 @@
+H4sIAAAAAAAAAO1dW5fbthF+z6/A2XPal5biRaREqmvlrG+JG6/t4900bd+4JLSCQxIqSa1X+fUFeIUkAiIlkpFs5jRxTQ1mgAEw881gQF7/+Ox74AmGEcLBiyt1pFwBGDjYRcHji6tf799K5hWIYjtwbQ8H8MXVBkZXP85/uA6hg0N3lv4BVna8fHElK+Qfifw7lsaKZkoTdaJeASIgiGYoiGEY2N6Lq2Ucr2ay/PXr1xEOHeSS/z7KQSTnFHkL6K4dO066xWtSkORtXBTFKHDErRiioh1ckZHYMXQFzQqavBWOlzCUAtuH/FYlTd7Kh/4D0fcSrfitSppCG2GIQ4Em6M85rYN9X6SA9PecOp1CPnX6e069It3CZJYkF8Y28iJ+u13KnMPvcPNVKDAjKEbuk9aCkdOfC9rndA1JyIVBjBYIinS2T5zzWayDdAfw2mYEhVZCuIAh2ThQpJCSKG9nu24II0GbjKDcRU+ILEBphSMkXuS7lDkHotnf+a3orznlkmwTHG74xBlBOU8rD298okrRZOU0eav/rW2PqP7QTt8iy9tGMHxCjmDzZQTba/fQii3mhlgIokMkmtKSptxNEbRDh276dShYttt0ZQ/J30P4hOBXUTcLorzdw9oTzCr9dbd/hEGE16FIe3ukV/MfAPnnOrUfs4SY2TzpryzFOkRzyj0i7EvWFV7iWmZa7LGh3mUuapUQ7DVb4iieF2IL4uRxOhBZOJJrZsOyO5yRxFJ42LE9OIfBtVzxOBMoZHmd7ah86zGC8l+cECZbQPJhvMTu/DZxFFLCKYTutcwj3GcVrR98FFG/L7nEq801RdUlxZK08b2mzDR9NlZHlmX9t+S522JP454dxZJPwANRoptz1RRJVSV1eq+NZwZhrIx0TflvofuKNhXD9mzkQ3ceh2vIjDF7uk9PAE3KL/EOO812fhS0XoXIt8ONmMs2UTbPlTN5nRqYzA4JQFNuiWpqWDUlZSqpk3vVnCnaTLNGxsSqqeGsSxSggCcUoQfkoZgY/dX6wUPOVdbJKmBXMmE7mKw7pmsTSdEl1Ugm35ypxshUpmXXtqgr+VUPWMxVNGBm0CU0mT2iJxgkKC2avwwJ2sV0F4uoDjBc2D7yNgnt/PWacIwr+LFE5YzIzJQwj0sgyeDOSLSIGLIrmeG07Xp2PJaQ4w7pFtdk+adgTcgjxWtsywzj5GBIPKgcELEMMsCYI0th+5xmu+/7SLAKSorHVUHP2SNtb+ImYwCrdSyR+I74fVUxjYkyuRLs+9ojlQtmJLpaefZGQoELn4nl2NGByFaINcC3FV3rluWfIqH93/dpJMdDRD9ES9XUbItqoJQyKOZAkwzL1CTTMsfVeInHfQs/VXHZxk88Ngfx1F5DuZFCdtSX2L87B6/WEZDAGy+iiDfc5bltJavY2cSkhAkUwiF6RPQP5/g5qQ1ea03GYTDb/mQ00QdXh4naM08JcgcnJN7fUfKBLZULZ+yNFG9WxaK4WRNkG4J3rwtOe5S1eD7Z3hrO9SmZBFXVp4pSyS+lqsWQ+MY5E1RFSXdHhJBE5V/t0JXtpOevUxgwIuQ/pk/evX7BdOOvtr/6x8oO4wDSX24/mPcfbz7fVfaOiqzVtxB6SVxAc0vzCHqLSnZbVNt+psr8VzkaFtHU9a9FlMTi5OxhGWozkbkUrX0KvoWIoYzRj49Y1Jrwku0mk2cUIgeWbhsYlbyKTKcYgxRUTZAHO1zdmmnaSB2bDZAHOzOLBcERaeD5GOL1qqGjPrErnEUfyRVkha5K3ZarqcBI+tTSLGMP1ahigJQzlPPmFQhLbHB3oREbmVsUwChjvS406kPrrAwRRNqnO+SO2RYtu2SW9YlumWXV2DUnjeVGWqnCShx3fBgoHXLErEAXUo9EM7gpx1dL6BP7FW4KLrsUQnYh9qAUo9iD80/L0pEzj8VLLSaiBEuSpd2Q0JVupXJKkycH25F/6dIwi2bpg4PtXHsz14xSLfbmkPoPDKawbIFoF1YN2TxqyKp2zJDVJkMWD6VMND/aAfojMamHO5Esul8DlBxqxhuAF+C3DfZRUG5F/rpkGWWphnqb3iGi5u/t0PYRLO0yfVireQgf6eB++0+5BdIn9YTjdUC24PzXEhzmjw4bnIPDLGYXRbb/gB7Xia+pNyn12rOwUdfVSbmCajVpKj1x9amt+/zuw08/fXz/ukoiQ3ZYi02Vw5w9iEivZQFO2UXm9YFYm5htC5uY5H8j3TD+dMymVWSiGmA2rWXMtq+XEzFbC1pnZQyYbcBsR2C2l7azhB4OI+pl7xxEDzMBCsC+hA4RnaL3ieiUc0B0ynGITjGOQnTm+SG6e/hsR8COwc2aJk86R3Y7YhoDu/t/f/fATtXGY2VAdiVZ68hui67MBXIyikUp1oHj2oLs2JxiwxTqbkfbzCke0RWW/0F8WiiLUW8FQlXHmmFYk8Z5xYKnXHI4FaYeUE+LqcUjlc/KGGDqdwRTPy03EXKIlw9ccEPAJA6w30KakejKxU6MQ9sDn7OCFsI/wg4xMrAPwKpax6E3/Rj0pqgdANaT8dvH3z17iX27c+D2AYe+fQJw+/jLmQG3+vBC5IzaABglMqhGGFmNvBBe5DS8c8/dMnYhsz1iHtfyeoWQH0PG48TUYgtZsXQ8Xls17kJu25Q8fnv127Vq+0pqHt+sql7ILafh8aAXDYQMEoJG9XMWrQlVjHuFwoyZYY0svUn9HNO5o7Kgp4ln+bMo80DeoKQ8bDv3imNcjBpUzojYpbUxqjJSVd2StZGiqKqlGF9g1KSURiQhoGbcQ39AF8QEgkQouWZCK7Kv6sstmTQTntfzUGxIlJYgw9pCKytzRNJOrdApeFc0q/ITtVbdNd2Pya6tCGeMiaqa1tgSBi+0rVxSVtWc1igdZZW2m28X78BT8+2n729WRsNApkYpKduqVjmpKlmmqUuqpdYsJ2UlbMU0VZx6jmka1NqyQcurEEdRCBdtRzTJbhEED2x/UrI77NkheLW0w0di4z5DFz+Dtx7+Cl7aMdmYmxm4AfckEoI++LTEMYYedEhI5NBEu0MimZvVKsS2syz6Jwpd5EP9q++KONT1pr4ll8Rj2a5b4knpzTXxOtCde+JJbMtFJfxruSke5QGA1Fg1XHVkW5quwy9k51PpJFRGjgfz3cRdooXNTNxcGli1ny8oLW8tMel48qGkpuCf6d+S85UlBG92TMwddhCMN9l4t5tW4Io9xCCIOQ+UKBwFyLfvfOiTkTYdnzcgh8htFZBrUkQWuWQaim7qk4nFrKMO4XhNqceC8X7hcXuT20m0pVjyp3/dvRqR5T4dmePJZKKbvYVcRPjqKXIOC7+MqW4jEiLrnRgefXw4EiopO4iExKbv1EjodMPKyjiDSKjxxTpWwomX61hWf2YkdMJFO5Zl08t2bNvv4Qju2MuIXVzA2+pYy+FsEqM+YS+2kQNCGK2IfYYUWAZ2TDrreRuAHWcdhih4BBEFmDhw1wTx0b+uvQVyIfBRAAlpCc6HkLYL388T1a//5/Xizwk1Ob05cgkch+t5LNvE9jwZPeF7nviLyy9wb/qmL09LrviSVfBiX3eHLvnWSUgQ25m9SEha2SvqujvPSNSpP20hI0FNC3j35s0boOvxEmx5lbsVdBBZYVEMXhUK+DugNhHQdpebpzD1885TdBHKKupE/jKKsOfbMfUoxkixRophHGPVjvFmjeRfRkDb3oR3lZjSLd0YK5apTPtMTB2QehmT22K2wqidraguN20/W2G2d7e9BfPKyhiyFUO2YshWNNbrhWcrwC8wIlYWxRA44SaKh6REDyiOJ613JMfryJCa6BLd8WT0hPB44r/51ESpuzZSE/3XSpQvj+g0M5HWa70hqnncgFubegfiFZK7KlkpF/Sol7jUHMSk5iuad/lfcg6CTalPRtOpQZxHL5HpbjpdLPwyAtT2prq7shhlbFhT09T7mOOaUi9jclvMPmi1sw/V72RpP/vAmr72sw9NDSsrY8g+DNmHIfvQWK8Xnn0YaiXarJU4CdjxRPUL7ni9GBISXQI+noyeQB9P/DefkCh1d6G1Ej1lJMoiCPA5/SojcSHxEqb1E5zSiYipnbjcZMW0hXeEX2IEq1uqppCtofZSJlFT6mVEsO1NbtuZqMxGus8j9sba7qmGTkDFSNG0Pia+sfxTLtpXMvwOLtGrE0U3FKvG1ZGSsoN0iJFcc58mxnU607XRpP43mFi+1abboKZb05OX1hozXR0Z4/7SIUN4nTSWG2mlKv/R7dvAmgfQb+2HMMNMFPO8cu8hiZ2JEcIusMPQ3kQAP8EQePRyPXkCbYKLCPJ4XNK9RncssXrZhXoX5m8PAgscArhYIPpa2xhEyakOTA98CIrMvpk9BNz9+y+e/N59GK8jx/kxHrfLjfCS90G7iAR2cRLoZU/k7DRYXiEk3ymWNlU03VR1RZko2rhRONf/+bLRTzRX73wZtBSzlb0QH9botQ9r9L4utpqdXmzt7zNIw2HNbuPhsOYw67NBk9/LYc2ANc8Ja353yHI44ukyI8qT0VNWlCf+cgOA2jWnue4us+b0rGKCSznG2csFTvXJeR/j2OEzeuoi1083y5cHJ4XwOIgphtfMqazJ6niijhZrzxut3MXflrHvtZ/wl2AYEvixZ9nq2SnaNgkS56bCvGueed6Ajw+jyH6E81d2EOAYFH0E5TdhQDJWkMzF7CT9bUlsZDD3lPd9HHNwenEGZ2YJBpvqMl0EtyrBXvQGg97L6+5Y0f4B0cO52NHnYmbtczGz83MxazZWR4bZ3ldyWvCFrIzhXGw4F6Nkb+zQ24Abj+y97JsSeAFIJ+kHGglSXDNo8mfsQ6IeO0IR/X7jz+uAlpfGdDAognYEwS3ZjUN9aaeuhyeoT/fD68O3cKTF6c2Ri+nYaIDHtNeIgNeJE6OCLbYnRgb7vM4wOkg6eVqEwJuKb2HHnZ5DEkyc/RCR1enEDNY4x+RQmQTu6+XtLxH28GPyznbmQ8yXmhWyWvhY56XFsuOpKmdTNVrhANJPFZjj8eQYU38Momgkf4hqj4xqVUX8vdcyqk0ou41qVbLh9JFptvii8NP3MitjiGqHqJaSJfEsMVyv8cqm1x7T8w8YYhKtktD1Fsb2A/ZQ5NNA9pY8pef19BLlTbTxVzH2iRFwwOeJrIFb5JQffB0i2o4cEE9a706I15EBae8U6xLN/0V7K9T9eWJttQbWZtv56YeVy5o2n/9h5RYw+qf3+A58/PDmYsG4qZ35N027PaL1wq1Y1NBkS1anujac0NbKwTRV33BAKxzkpQW1qqXLdA3cKupUnfb3napMrC8QO4SwR4ewNa4ElJQdh7DqTJuOpnqLVwJO94CsjCGEHULYpJAPPpNlv8jeu5Acy5LpCOj+osHs78gN4AY8evgBxyGycbTxHBiSaNdNo11vE2Gp+mcS89IS9Q3wh9i2Oz/EE9KXL+LJ/xYiWU5vzvaUtruggNeHb/aQtpsAIenjcEbb4Rktd94u5Ii2Ttqo3SPa92iFXPAZRkSes+whH7T1G2WRTfHW8/L/lgKus0sbs/SP+Q//BxvQAv4zvAAA
\ No newline at end of file

From 5c17e768b26789df1e8bf120eeebab93854a716e Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Mon, 23 Nov 2020 16:01:23 +0100
Subject: [PATCH 32/34] set wf configuration with
 spark.dynamicAllocation.maxExecutors 20 over 20 input partitions

---
 .../orcid/SparkDownloadOrcidAuthors.java      | 39 +++++++++++++------
 .../oozie_app/workflow.xml                    |  4 +-
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
index 850a654d4..68f44541a 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
@@ -65,9 +65,14 @@ public class SparkDownloadOrcidAuthors {
 			spark -> {
 				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
-				LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsedRecords");
-				LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("modifiedRecords");
-				LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloadedRecords");
+				LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
+				LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
+				LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
+				LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
+				LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
+				LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
+				LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
+				LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");
 
 				logger.info("Retrieving data from lamda sequence file");
 				JavaPairRDD<Text, Text> lamdaFileRDD = sc
@@ -99,6 +104,18 @@ public class SparkDownloadOrcidAuthors {
 						int statusCode = response.getStatusLine().getStatusCode();
 						downloaded.setStatusCode(statusCode);
 						if (statusCode != 200) {
+							switch (statusCode) {
+								case 403:
+									errorHTTP403Acc.add(1);
+								case 409:
+									errorHTTP409Acc.add(1);
+								case 503:
+									errorHTTP503Acc.add(1);
+								case 525:
+									errorHTTP525Acc.add(1);
+								default:
+									errorHTTPGenericAcc.add(1);
+							}
 							logger
 								.info(
 									"Downloading " + orcidId + " status code: "
@@ -106,10 +123,6 @@ public class SparkDownloadOrcidAuthors {
 							return downloaded.toTuple2();
 						}
 						downloadedRecordsAcc.add(1);
-						long currentDownloaded = downloadedRecordsAcc.value();
-						if ((currentDownloaded % 10000) == 0) {
-							logger.info("Current downloaded: " + currentDownloaded);
-						}
 						downloaded
 							.setCompressedData(
 								ArgumentApplicationParser
@@ -125,14 +138,11 @@ public class SparkDownloadOrcidAuthors {
 				sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
 
 				logger.info("Start execution ...");
-//				List<Tuple2<Text, Text>> sampleList = lamdaFileRDD.take(500);
-//				JavaRDD<Tuple2<Text, Text>> sampleRDD = sc.parallelize(sampleList);
-//				sampleRDD
-				JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD
-					.filter(isModifiedAfterFilter);
+				JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter);
 				logger.info("Authors modified count: " + authorsModifiedRDD.count());
 				logger.info("Start downloading ...");
 				authorsModifiedRDD
+					.repartition(20)
 					.map(downloadRecordFunction)
 					.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
 					.saveAsNewAPIHadoopFile(
@@ -144,6 +154,11 @@ public class SparkDownloadOrcidAuthors {
 				logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
 				logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
 				logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
+				logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
+				logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
+				logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
+				logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
+				logger.info("errorHTTPGenericAcc: " + errorHTTPGenericAcc.value().toString());
 			});
 
 	}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
index 5f728d35b..1c2a7b588 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
@@ -149,9 +149,9 @@
             <class>eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors</class>
             <jar>dhp-doiboost-${projectVersion}.jar</jar>
             <spark-opts>
-                --num-executors=${sparkExecutorNumber}
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                 --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

From 99a086f0c68f17489e4b39e32e9fbbb24418b21d Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Tue, 24 Nov 2020 17:49:32 +0100
Subject: [PATCH 33/34] max concurrent executors set to 10, according to ORCID
 Director of Technology mail request

---
 .../orcid/SparkDownloadOrcidAuthors.java      | 17 +++++--
 .../oozie_app/workflow.xml                    |  6 +--
 .../doiboost/orcid/OrcidClientTest.java       | 47 +++++++++++++++++--
 3 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
index 68f44541a..598835a00 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
@@ -100,7 +100,13 @@ public class SparkDownloadOrcidAuthors {
 						HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
 						httpGet.addHeader("Accept", "application/vnd.orcid+xml");
 						httpGet.addHeader("Authorization", String.format("Bearer %s", token));
+						long startReq = System.currentTimeMillis();
 						CloseableHttpResponse response = client.execute(httpGet);
+						long endReq = System.currentTimeMillis();
+						long reqTime = endReq - startReq;
+						if (reqTime < 1000) {
+							Thread.sleep(1000 - reqTime);
+						}
 						int statusCode = response.getStatusLine().getStatusCode();
 						downloaded.setStatusCode(statusCode);
 						if (statusCode != 200) {
@@ -111,15 +117,16 @@ public class SparkDownloadOrcidAuthors {
 									errorHTTP409Acc.add(1);
 								case 503:
 									errorHTTP503Acc.add(1);
+									throw new RuntimeException("Orcid request rate limit reached (HTTP 503)");
 								case 525:
 									errorHTTP525Acc.add(1);
 								default:
 									errorHTTPGenericAcc.add(1);
+									logger
+										.info(
+											"Downloading " + orcidId + " status code: "
+												+ response.getStatusLine().getStatusCode());
 							}
-							logger
-								.info(
-									"Downloading " + orcidId + " status code: "
-										+ response.getStatusLine().getStatusCode());
 							return downloaded.toTuple2();
 						}
 						downloadedRecordsAcc.add(1);
@@ -142,7 +149,7 @@ public class SparkDownloadOrcidAuthors {
 				logger.info("Authors modified count: " + authorsModifiedRDD.count());
 				logger.info("Start downloading ...");
 				authorsModifiedRDD
-					.repartition(20)
+					.repartition(10)
 					.map(downloadRecordFunction)
 					.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
 					.saveAsNewAPIHadoopFile(
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
index 1c2a7b588..b9383558c 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
@@ -14,10 +14,6 @@
             </value>
             <description>the shell command that downloads the lambda file from orcid containing last orcid update informations</description>
         </property>
-        <property>
-            <name>sparkExecutorNumber</name>
-            <value>20</value>
-        </property>
         <property>
             <name>sparkDriverMemory</name>
             <value>7G</value>
@@ -35,7 +31,7 @@
         </property>
         <property>
             <name>spark2MaxExecutors</name>
-            <value>20</value>
+            <value>10</value>
         </property>
         <property>
             <name>oozieActionShareLibForSpark2</name>
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
index d6ce99f1c..66a7badb7 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
@@ -10,6 +10,9 @@ import java.nio.file.Paths;
 import java.nio.file.StandardOpenOption;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
+import java.time.Duration;
+import java.time.LocalDateTime;
+import java.time.temporal.TemporalUnit;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
@@ -24,6 +27,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull;
 import org.junit.jupiter.api.Test;
+import org.mortbay.log.Log;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import jdk.nashorn.internal.ir.annotations.Ignore;
@@ -45,7 +49,7 @@ public class OrcidClientTest {
 
 	@Test
 	private void multipleDownloadTest() throws Exception {
-		int toDownload = 1;
+		int toDownload = 10;
 		long start = System.currentTimeMillis();
 		OrcidDownloader downloader = new OrcidDownloader();
 		TarArchiveInputStream input = new TarArchiveInputStream(
@@ -64,7 +68,7 @@ public class OrcidClientTest {
 				List<String> recordInfo = Arrays.asList(values);
 				String orcidId = recordInfo.get(0);
 				if (downloader.isModified(orcidId, recordInfo.get(3))) {
-					downloadTest(orcidId);
+					slowedDownDownload(orcidId);
 					modified++;
 				}
 				rowNum++;
@@ -181,7 +185,7 @@ public class OrcidClientTest {
 	}
 
 	@Test
-	public void testReadBase64CompressedRecord() throws Exception {
+	private void testReadBase64CompressedRecord() throws Exception {
 		final String base64CompressedRecord = IOUtils
 			.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
 		final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
@@ -257,4 +261,41 @@ public class OrcidClientTest {
 		Path path = Paths.get("/tmp/orcid_log.txt");
 		Files.write(path, log.getBytes(), StandardOpenOption.APPEND);
 	}
+
+	@Test
+	private void slowedDownDownloadTest() throws Exception {
+		String orcid = "0000-0001-5496-1243";
+		String record = slowedDownDownload(orcid);
+		String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
+		File f = new File(filename);
+		OutputStream outStream = new FileOutputStream(f);
+		IOUtils.write(record.getBytes(), outStream);
+	}
+
+	private String slowedDownDownload(String orcidId) throws Exception {
+		try (CloseableHttpClient client = HttpClients.createDefault()) {
+			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
+			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
+			httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
+			long start = System.currentTimeMillis();
+			CloseableHttpResponse response = client.execute(httpGet);
+			long endReq = System.currentTimeMillis();
+			long reqSessionDuration = endReq - start;
+			logToFile("req time (millisec): " + reqSessionDuration);
+			if (reqSessionDuration < 1000) {
+				logToFile("wait ....");
+				Thread.sleep(1000 - reqSessionDuration);
+			}
+			long end = System.currentTimeMillis();
+			long total = end - start;
+			logToFile("total time (millisec): " + total);
+			if (response.getStatusLine().getStatusCode() != 200) {
+				logToFile("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
+			}
+			return IOUtils.toString(response.getEntity().getContent());
+		} catch (Throwable e) {
+			e.printStackTrace();
+		}
+		return new String("");
+	}
 }

From 40c4559e923c5e2f0124477dc8deb8bfe2ef22d5 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Mon, 30 Nov 2020 14:19:22 +0100
Subject: [PATCH 34/34] added datainfo on authors pid with
 "sysimport:crosswalk:entityregistry",

---
 .../doiboost/orcidnodoi/oaf/PublicationToOaf.java  | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
index ece59c3f1..18fecc6c2 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java
@@ -144,7 +144,7 @@ public class PublicationToOaf implements Serializable {
 
 		publication.setLastupdatetimestamp(new Date().getTime());
 
-		publication.setDateofcollection("2019-10-22");
+		publication.setDateofcollection("2020-10-14");
 		publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
 
 		// Adding external ids
@@ -526,6 +526,18 @@ public class PublicationToOaf implements Serializable {
 		q.setSchemeid(ModelConstants.DNET_PID_TYPES);
 		q.setSchemename(ModelConstants.DNET_PID_TYPES);
 		sp.setQualifier(q);
+		final DataInfo dataInfo = new DataInfo();
+		dataInfo.setDeletedbyinference(false);
+		dataInfo.setInferred(false);
+		dataInfo.setTrust("0.9");
+		dataInfo
+				.setProvenanceaction(
+						mapQualifier(
+								"sysimport:crosswalk:entityregistry",
+								"Harvested",
+								"dnet:provenanceActions",
+								"dnet:provenanceActions"));
+		sp.setDataInfo(dataInfo);
 		return sp;
 	}
 }