merged from master

2021-01-11 16:16:20 +01:00 · 2021-01-11 16:16:20 +01:00 · 47db46ebae
parent c88ec1bc52 41500669e2
commit 47db46ebae
309 changed files with 15905 additions and 3355 deletions
--- a/dhp-build/dhp-code-style/pom.xml
+++ b/dhp-build/dhp-code-style/pom.xml
@ -15,12 +15,12 @@
        <snapshotRepository>
            <id>dnet45-snapshots</id>
            <name>DNet45 Snapshots</name>
-            <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
+            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
            <layout>default</layout>
        </snapshotRepository>
        <repository>
            <id>dnet45-releases</id>
-            <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
+            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
        </repository>
    </distributionManagement>

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.common;
+
+import java.util.Map;
+
+import com.google.common.collect.Maps;
+
+/**
+ * Lookup tables used to translate access-right class ids into COAR access-right codes and to
+ * resolve the label associated with each COAR code.
+ */
+public class Constants {
+
+	/** Maps an access-right class id (e.g. {@code OPEN}) to its COAR code (e.g. {@code c_abf2}). */
+	public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
+
+	/** Maps a COAR code to its label. */
+	public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
+
+	// Declared final: a public static constant must not be reassignable by client code.
+	public static final String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
+
+	static {
+		accessRightsCoarMap.put("OPEN", "c_abf2");
+		accessRightsCoarMap.put("RESTRICTED", "c_16ec");
+		// "OPEN SOURCE" shares the code of "OPEN"
+		accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
+		accessRightsCoarMap.put("CLOSED", "c_14cb");
+		accessRightsCoarMap.put("EMBARGO", "c_f1cf");
+	}
+
+	static {
+		coarCodeLabelMap.put("c_abf2", "OPEN");
+		coarCodeLabelMap.put("c_16ec", "RESTRICTED");
+		coarCodeLabelMap.put("c_14cb", "CLOSED");
+		coarCodeLabelMap.put("c_f1cf", "EMBARGO");
+	}
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/GraphResultMapper.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/GraphResultMapper.java
@ -0,0 +1,412 @@
+
+package eu.dnetlib.dhp.common;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.dump.oaf.*;
+import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
+import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.Journal;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class GraphResultMapper implements Serializable {
+
+	/**
+	 * Maps a graph result onto its dump representation.
+	 *
+	 * The input is cast to {@code eu.dnetlib.dhp.schema.oaf.Result}. When its result type is set, the
+	 * fields specific to that type (publication, dataset, software, other) are copied first, followed
+	 * by the fields common to every result. The collectedfrom list is mapped regardless of the result
+	 * type, when present.
+	 *
+	 * @param in the graph entity to map; must be an instance of {@code eu.dnetlib.dhp.schema.oaf.Result}
+	 * @param <E> the entity type
+	 * @return the mapped {@code CommunityResult}
+	 */
+	public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
+		E in) {
+
+		CommunityResult out = new CommunityResult();
+
+		eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
+		Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
+		if (ort.isPresent()) {
+			switch (ort.get().getClassid()) {
+				case "publication":
+					Optional<Journal> journal = Optional
+						.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
+					if (journal.isPresent()) {
+						Journal j = journal.get();
+						Container c = new Container();
+						c.setConferencedate(j.getConferencedate());
+						c.setConferenceplace(j.getConferenceplace());
+						c.setEdition(j.getEdition());
+						c.setEp(j.getEp());
+						c.setIss(j.getIss());
+						c.setIssnLinking(j.getIssnLinking());
+						c.setIssnOnline(j.getIssnOnline());
+						c.setIssnPrinted(j.getIssnPrinted());
+						c.setName(j.getName());
+						c.setSp(j.getSp());
+						c.setVol(j.getVol());
+						out.setContainer(c);
+						out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
+					}
+					break;
+				case "dataset":
+					eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
+					Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
+					Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
+
+					out
+						.setGeolocation(
+							Optional
+								.ofNullable(id.getGeolocation())
+								.map(
+									igl -> igl
+										.stream()
+										.filter(Objects::nonNull)
+										.map(gli -> {
+											GeoLocation gl = new GeoLocation();
+											gl.setBox(gli.getBox());
+											gl.setPlace(gli.getPlace());
+											gl.setPoint(gli.getPoint());
+											return gl;
+										})
+										.collect(Collectors.toList()))
+								.orElse(null));
+
+					out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
+					break;
+				case "software":
+
+					eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
+					Optional
+						.ofNullable(is.getCodeRepositoryUrl())
+						.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
+					Optional
+						.ofNullable(is.getDocumentationUrl())
+						.ifPresent(
+							value -> out
+								.setDocumentationUrl(
+									value
+										.stream()
+										.map(v -> v.getValue())
+										.collect(Collectors.toList())));
+
+					Optional
+						.ofNullable(is.getProgrammingLanguage())
+						.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
+
+					out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
+					break;
+				case "other":
+
+					eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
+					out
+						.setContactgroup(
+							Optional
+								.ofNullable(ir.getContactgroup())
+								.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
+								.orElse(null));
+
+					out
+						.setContactperson(
+							Optional
+								.ofNullable(ir.getContactperson())
+								.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
+								.orElse(null));
+					out
+						.setTool(
+							Optional
+								.ofNullable(ir.getTool())
+								.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
+								.orElse(null));
+
+					out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
+
+					break;
+			}
+
+			Optional
+				.ofNullable(input.getAuthor())
+				.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
+
+			// I do not map Access Right UNKNOWN or OTHER
+
+			Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
+			if (oar.isPresent()) {
+				if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
+					String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
+					out
+						.setBestaccessright(
+							AccessRight
+								.newInstance(
+									code,
+									Constants.coarCodeLabelMap.get(code),
+									Constants.COAR_ACCESS_RIGHT_SCHEMA));
+				}
+			}
+
+			final List<String> contributorList = new ArrayList<>();
+			Optional
+				.ofNullable(input.getContributor())
+				.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
+			out.setContributor(contributorList);
+
+			Optional
+				.ofNullable(input.getCountry())
+				.ifPresent(
+					value -> out
+						.setCountry(
+							value
+								.stream()
+								.map(
+									c -> {
+										// countries flagged as UNKNOWN are dropped by the filter below
+										if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
+											return null;
+										}
+										Country country = new Country();
+										country.setCode(c.getClassid());
+										country.setLabel(c.getClassname());
+										Optional
+											.ofNullable(c.getDataInfo())
+											.ifPresent(
+												provenance -> country
+													.setProvenance(
+														Provenance
+															.newInstance(
+																provenance
+																	.getProvenanceaction()
+																	.getClassname(),
+																c.getDataInfo().getTrust())));
+										return country;
+									})
+								.filter(Objects::nonNull)
+								.collect(Collectors.toList())));
+
+			final List<String> coverageList = new ArrayList<>();
+			Optional
+				.ofNullable(input.getCoverage())
+				.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
+			out.setCoverage(coverageList);
+
+			out.setDateofcollection(input.getDateofcollection());
+
+			final List<String> descriptionList = new ArrayList<>();
+			Optional
+				.ofNullable(input.getDescription())
+				.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
+			out.setDescription(descriptionList);
+			Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
+			if (oStr.isPresent()) {
+				out.setEmbargoenddate(oStr.get().getValue());
+			}
+
+			final List<String> formatList = new ArrayList<>();
+			Optional
+				.ofNullable(input.getFormat())
+				.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
+			out.setFormat(formatList);
+			out.setId(input.getId());
+			out.setOriginalId(input.getOriginalId());
+
+			Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
+				.ofNullable(input.getInstance());
+
+			if (oInst.isPresent()) {
+				out
+					.setInstance(
+						oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
+
+			}
+
+			Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
+			if (oL.isPresent()) {
+				eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
+				out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
+			}
+			Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
+			if (oLong.isPresent()) {
+				out.setLastupdatetimestamp(oLong.get());
+			}
+			Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
+			if (otitle.isPresent()) {
+				// only the first "main title" and the first "subtitle" are kept
+				List<StructuredProperty> iTitle = otitle
+					.get()
+					.stream()
+					.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
+					.collect(Collectors.toList());
+				if (iTitle.size() > 0) {
+					out.setMaintitle(iTitle.get(0).getValue());
+				}
+
+				iTitle = otitle
+					.get()
+					.stream()
+					.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
+					.collect(Collectors.toList());
+				if (iTitle.size() > 0) {
+					out.setSubtitle(iTitle.get(0).getValue());
+				}
+
+			}
+
+			List<ControlledField> pids = new ArrayList<>();
+			Optional
+				.ofNullable(input.getPid())
+				.ifPresent(
+					value -> value
+						.stream()
+						.forEach(
+							p -> pids
+								.add(
+									ControlledField
+										.newInstance(p.getQualifier().getClassid(), p.getValue()))));
+			out.setPid(pids);
+			oStr = Optional.ofNullable(input.getDateofacceptance());
+			if (oStr.isPresent()) {
+				out.setPublicationdate(oStr.get().getValue());
+			}
+			oStr = Optional.ofNullable(input.getPublisher());
+			if (oStr.isPresent()) {
+				out.setPublisher(oStr.get().getValue());
+			}
+
+			List<String> sourceList = new ArrayList<>();
+			Optional
+				.ofNullable(input.getSource())
+				.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
+			// the collected sources were previously discarded: store them on the output
+			out.setSource(sourceList);
+			List<Subject> subjectList = new ArrayList<>();
+			Optional
+				.ofNullable(input.getSubject())
+				.ifPresent(
+					value -> value
+						.forEach(s -> subjectList.add(getSubject(s))));
+
+			out.setSubjects(subjectList);
+
+			out.setType(input.getResulttype().getClassid());
+		}
+
+		// collectedfrom may be missing on the input: map it only when present instead of failing
+		Optional
+			.ofNullable(input.getCollectedfrom())
+			.ifPresent(
+				cfs -> out
+					.setCollectedfrom(
+						cfs
+							.stream()
+							.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
+							.collect(Collectors.toList())));
+
+		return out;
+
+	}
+
+	/**
+	 * Maps a graph instance onto a {@code CommunityInstance}: the common values are set first, then
+	 * the collectedfrom and hostedby key/value pairs are copied when they are available.
+	 *
+	 * @param i the graph instance
+	 * @return the mapped instance
+	 */
+	private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
+		CommunityInstance instance = new CommunityInstance();
+
+		setCommonValue(i, instance);
+
+		// guard against instances lacking collectedfrom / hostedby, as done for the other fields
+		Optional
+			.ofNullable(i.getCollectedfrom())
+			.ifPresent(cf -> instance.setCollectedfrom(KeyValue.newInstance(cf.getKey(), cf.getValue())));
+
+		Optional
+			.ofNullable(i.getHostedby())
+			.ifPresent(hb -> instance.setHostedby(KeyValue.newInstance(hb.getKey(), hb.getValue())));
+
+		return instance;
+
+	}
+
+	private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
+		Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
+			.ofNullable(i.getAccessright());
+		if (opAr.isPresent()) {
+			if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
+				String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
+				instance
+					.setAccessright(
+						AccessRight
+							.newInstance(
+								code,
+								Constants.coarCodeLabelMap.get(code),
+								Constants.COAR_ACCESS_RIGHT_SCHEMA));
+			}
+		}
+
+		Optional
+			.ofNullable(i.getLicense())
+			.ifPresent(value -> instance.setLicense(value.getValue()));
+		Optional
+			.ofNullable(i.getDateofacceptance())
+			.ifPresent(value -> instance.setPublicationdate(value.getValue()));
+		Optional
+			.ofNullable(i.getRefereed())
+			.ifPresent(value -> instance.setRefereed(value.getClassname()));
+		Optional
+			.ofNullable(i.getInstancetype())
+			.ifPresent(value -> instance.setType(value.getClassname()));
+		Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
+
+	}
+
+	private static Subject getSubject(StructuredProperty s) {
+		Subject subject = new Subject();
+		subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
+		Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
+		if (di.isPresent()) {
+			Provenance p = new Provenance();
+			p.setProvenance(di.get().getProvenanceaction().getClassname());
+			p.setTrust(di.get().getTrust());
+			subject.setProvenance(p);
+		}
+
+		return subject;
+	}
+
+	private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
+		Author a = new Author();
+		a.setFullname(oa.getFullname());
+		a.setName(oa.getName());
+		a.setSurname(oa.getSurname());
+		a.setRank(oa.getRank());
+
+		Optional<List<StructuredProperty>> oPids = Optional
+			.ofNullable(oa.getPid());
+		if (oPids.isPresent()) {
+			Pid pid = getOrcid(oPids.get());
+			if (pid != null) {
+				a.setPid(pid);
+			}
+		}
+
+		return a;
+	}
+
+	private static Pid getOrcid(List<StructuredProperty> p) {
+		for (StructuredProperty pid : p) {
+			if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
+				Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
+				if (di.isPresent()) {
+					return Pid
+						.newInstance(
+							ControlledField
+								.newInstance(
+									pid.getQualifier().getClassid(),
+									pid.getValue()),
+							Provenance
+								.newInstance(
+									di.get().getProvenanceaction().getClassname(),
+									di.get().getTrust()));
+				} else {
+					return Pid
+						.newInstance(
+							ControlledField
+								.newInstance(
+									pid.getQualifier().getClassid(),
+									pid.getValue())
+
+						);
+				}
+
+			}
+		}
+		return null;
+	}
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
@ -90,9 +90,6 @@ public class MakeTarArchive implements Serializable {
 		String p_string = p.toString();
 		if (!p_string.endsWith("_SUCCESS")) {
 			String name = p_string.substring(p_string.lastIndexOf("/") + 1);
-			if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
-				name = "communities_infrastructures.json";
-			}
 			TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
 			entry.setSize(fileStatus.getLen());
 			current_size += fileStatus.getLen();
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;

 import org.apache.commons.lang3.StringUtils;

@ -32,27 +33,33 @@ public class AuthorMerger {

 	}

-	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
 		int pa = countAuthorsPids(a);
 		int pb = countAuthorsPids(b);
 		List<Author> base, enrich;
 		int sa = authorsSize(a);
 		int sb = authorsSize(b);

-		if (pa == pb) {
-			base = sa > sb ? a : b;
-			enrich = sa > sb ? b : a;
-		} else {
+		if (sa == sb) {
 			base = pa > pb ? a : b;
 			enrich = pa > pb ? b : a;
+		} else {
+			base = sa > sb ? a : b;
+			enrich = sa > sb ? b : a;
 		}
-		enrichPidFromList(base, enrich);
+		enrichPidFromList(base, enrich, threshold);
 		return base;
 	}

-	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+		return mergeAuthor(a, b, THRESHOLD);
+	}
+
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
 		if (base == null || enrich == null)
 			return;
+
+		// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
 		final Map<String, Author> basePidAuthorMap = base
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -63,6 +70,7 @@ public class AuthorMerger {
 					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
 			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));

+		// <pid, Author> (list of pid that are missing in the other list)
 		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -83,10 +91,10 @@ public class AuthorMerger {
 						.max(Comparator.comparing(Tuple2::_1));

 					if (simAuthor.isPresent()) {
-						double th = THRESHOLD;
+						double th = threshold;
 						// increase the threshold if the surname is too short
 						if (simAuthor.get()._2().getSurname() != null
-							&& simAuthor.get()._2().getSurname().length() <= 3)
+							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
 							th = 0.99;

 						if (simAuthor.get()._1() > th) {
@ -156,7 +164,7 @@ public class AuthorMerger {
 	}

 	private static String normalize(final String s) {
-		return nfd(s)
+		String[] normalized = nfd(s)
 			.toLowerCase()
 			// do not compact the regexes in a single expression, would cause StackOverflowError
 			// in case
@ -166,7 +174,12 @@ public class AuthorMerger {
 			.replaceAll("(\\p{Punct})+", " ")
 			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
-			.trim();
+			.trim()
+			.split(" ");
+
+		Arrays.sort(normalized);
+
+		return String.join(" ", normalized);
 	}

 	private static String nfd(final String s) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.schema.oaf;

 import java.util.ArrayList;
 import java.util.Arrays;
@ -13,19 +13,43 @@ import java.util.stream.Collectors;

 import org.apache.commons.lang3.StringUtils;

-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
-import eu.dnetlib.dhp.schema.oaf.Field;
-import eu.dnetlib.dhp.schema.oaf.Journal;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
-import eu.dnetlib.dhp.schema.oaf.OriginDescription;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.utils.DHPUtils;

 public class OafMapperUtils {

+	/**
+	 * Merges o2 into o1 according to the type of o1. Results are delegated to
+	 * {@code mergeResults}; datasources, organizations, projects and relations are merged in
+	 * place through their own {@code mergeFrom}.
+	 *
+	 * @param o1 the first object, whose type drives the merge
+	 * @param o2 the second object, cast to the same type as o1
+	 * @return the outcome of {@code mergeResults} for results, o1 otherwise
+	 * @throws RuntimeException when o1 is neither a supported entity subtype nor a relation
+	 */
+	public static Oaf merge(final Oaf o1, final Oaf o2) {
+		if (ModelSupport.isSubClass(o1, OafEntity.class)) {
+			if (ModelSupport.isSubClass(o1, Result.class)) {
+				return mergeResults((Result) o1, (Result) o2);
+			}
+			if (ModelSupport.isSubClass(o1, Datasource.class)) {
+				((Datasource) o1).mergeFrom((Datasource) o2);
+				return o1;
+			}
+			if (ModelSupport.isSubClass(o1, Organization.class)) {
+				((Organization) o1).mergeFrom((Organization) o2);
+				return o1;
+			}
+			if (ModelSupport.isSubClass(o1, Project.class)) {
+				((Project) o1).mergeFrom((Project) o2);
+				return o1;
+			}
+			throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
+		}
+
+		if (ModelSupport.isSubClass(o1, Relation.class)) {
+			((Relation) o1).mergeFrom((Relation) o2);
+			return o1;
+		}
+
+		throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
+	}
+
+	/**
+	 * Merges two results, choosing the merge target with a {@code ResultTypeComparator}: r1 is
+	 * the target when it compares strictly lower than r2, r2 otherwise.
+	 *
+	 * @param r1 the first result
+	 * @param r2 the second result
+	 * @return the result that received the merge
+	 */
+	public static Result mergeResults(Result r1, Result r2) {
+		final boolean r1Wins = new ResultTypeComparator().compare(r1, r2) < 0;
+		final Result target = r1Wins ? r1 : r2;
+		final Result other = r1Wins ? r2 : r1;
+
+		target.mergeFrom(other);
+		return target;
+	}
+
 	public static KeyValue keyValue(final String k, final String v) {
 		final KeyValue kv = new KeyValue();
 		kv.setKey(k);
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java
@ -0,0 +1,49 @@
+
+package eu.dnetlib.dhp.schema.oaf;
+
+import java.util.Comparator;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+
+/**
+ * Orders results by the class id of their result type: publication first, then dataset,
+ * software and other research product; any other class id falls back to lexicographical
+ * ordering. Null results are ordered after non-null ones.
+ */
+public class ResultTypeComparator implements Comparator<Result> {
+
+	@Override
+	public int compare(Result left, Result right) {
+
+		if (left == null) {
+			return right == null ? 0 : 1;
+		}
+		if (right == null) {
+			return -1;
+		}
+
+		final String lClass = left.getResulttype().getClassid();
+		final String rClass = right.getResulttype().getClassid();
+
+		if (lClass.equals(rClass)) {
+			return 0;
+		}
+
+		// class ids by decreasing precedence: the side matching first wins
+		final String[] precedence = {
+			ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
+			ModelConstants.DATASET_RESULTTYPE_CLASSID,
+			ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
+			ModelConstants.ORP_RESULTTYPE_CLASSID
+		};
+		for (String classId : precedence) {
+			if (lClass.equals(classId)) {
+				return -1;
+			}
+			if (rClass.equals(classId)) {
+				return 1;
+			}
+		}
+
+		// Else (but unlikely), lexicographical ordering will do.
+		return lClass.compareTo(rClass);
+	}
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
@ -5,6 +5,7 @@ import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
+import java.util.List;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;

@ -15,9 +16,15 @@ import org.apache.commons.codec.binary.Hex;
 import com.jayway.jsonpath.JsonPath;

 import net.minidev.json.JSONArray;
+import scala.collection.JavaConverters;
+import scala.collection.Seq;

 public class DHPUtils {

+	/**
+	 * Converts a Java list of strings into a Scala {@code Seq}, going through the list iterator.
+	 *
+	 * @param list the Java list to convert
+	 * @return the Scala sequence holding the elements of the list
+	 */
+	public static Seq<String> toSeq(List<String> list) {
+		final scala.collection.Iterator<String> scalaIterator = JavaConverters
+			.asScalaIteratorConverter(list.iterator())
+			.asScala();
+		return scalaIterator.toSeq();
+	}
+
 	public static String md5(final String s) {
 		try {
 			final MessageDigest md = MessageDigest.getInstance("MD5");
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
@ -0,0 +1,100 @@
+
+package eu.dnetlib.dhp.oa.merge;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import scala.Tuple2;
+
+public class AuthorMergerTest {
+
+	private String publicationsBasePath;
+
+	private List<List<Author>> authors;
+
+	@BeforeEach
+	public void setUp() throws Exception {
+
+		publicationsBasePath = Paths
+			.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
+			.toFile()
+			.getAbsolutePath();
+
+		authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
+			.stream()
+			.map(p -> p._2().getAuthor())
+			.collect(Collectors.toList());
+
+	}
+
+	@Test
+	public void mergeTest() { // used in the dedup: threshold set to 0.95
+
+		for (List<Author> authors1 : authors) {
+			System.out.println("List " + (authors.indexOf(authors1) + 1));
+			for (Author author : authors1) {
+				System.out.println(authorToString(author));
+			}
+		}
+
+		List<Author> merge = AuthorMerger.merge(authors);
+
+		System.out.println("Merge ");
+		for (Author author : merge) {
+			System.out.println(authorToString(author));
+		}
+
+		Assertions.assertEquals(7, merge.size());
+
+	}
+
+	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+		List<Tuple2<String, T>> res = new ArrayList<>();
+		BufferedReader reader;
+		try {
+			reader = new BufferedReader(new FileReader(path));
+			String line = reader.readLine();
+			while (line != null) {
+				res
+					.add(
+						new Tuple2<>(
+							MapDocumentUtil.getJPathString("$.id", line),
+							new ObjectMapper().readValue(line, clazz)));
+				// read next line
+				line = reader.readLine();
+			}
+			reader.close();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+
+		return res;
+	}
+
+	public String authorToString(Author a) {
+
+		String print = "Fullname = ";
+		print += a.getFullname() + " pid = [";
+		if (a.getPid() != null)
+			for (StructuredProperty sp : a.getPid()) {
+				print += sp.toComparableString() + " ";
+			}
+		print += "]";
+		return print;
+	}
+}
--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@ -6,7 +6,7 @@
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp</artifactId>
        <version>1.2.4-SNAPSHOT</version>
-        <relativePath>../</relativePath>
+        <relativePath>../pom.xml</relativePath>
    </parent>

    <artifactId>dhp-schemas</artifactId>
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
@ -7,6 +7,10 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;

 public class ModelConstants {

+	public static final String ORCID = "orcid";
+	public static final String ORCID_PENDING = "orcid_pending";
+	public static final String ORCID_CLASSNAME = "Open Researcher and Contributor ID";
+
 	public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies";
 	public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
 	public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource";
@ -101,6 +105,8 @@ public class ModelConstants {
 	public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
 		"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");

+	public static final Qualifier UNKNOWN_COUNTRY = qualifier(UNKNOWN, "Unknown", DNET_COUNTRY_TYPE, DNET_COUNTRY_TYPE);
+
 	private static Qualifier qualifier(
 		final String classid,
 		final String classname,
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
@ -2,8 +2,12 @@
 package eu.dnetlib.dhp.schema.oaf;

 import java.io.Serializable;
+import java.util.Collection;
 import java.util.List;
 import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;

 public abstract class Oaf implements Serializable {

@ -40,9 +44,34 @@ public abstract class Oaf implements Serializable {
 		this.lastupdatetimestamp = lastupdatetimestamp;
 	}

-	public void mergeOAFDataInfo(Oaf e) {
-		if (e.getDataInfo() != null && compareTrust(this, e) < 0)
-			dataInfo = e.getDataInfo();
+	/**
+	 * Merges the given object into this one: collectedfrom becomes the distinct concatenation
+	 * of the two lists (a null list counts as empty), lastupdatetimestamp becomes the greatest
+	 * of the two values (a null value counts as 0). Nothing happens when the argument is null.
+	 *
+	 * @param o the object to merge into this one
+	 */
+	public void mergeFrom(Oaf o) {
+		if (o == null) {
+			return;
+		}
+
+		final Stream<KeyValue> ownCollectedfrom = Optional
+			.ofNullable(getCollectedfrom())
+			.map(Collection::stream)
+			.orElse(Stream.empty());
+		final Stream<KeyValue> otherCollectedfrom = Optional
+			.ofNullable(o.getCollectedfrom())
+			.map(Collection::stream)
+			.orElse(Stream.empty());
+		// distinct() relies on KeyValue.equals
+		setCollectedfrom(
+			Stream
+				.concat(ownCollectedfrom, otherCollectedfrom)
+				.distinct()
+				.collect(Collectors.toList()));
+
+		final long ownTimestamp = Optional.ofNullable(getLastupdatetimestamp()).orElse(0L);
+		final long otherTimestamp = Optional.ofNullable(o.getLastupdatetimestamp()).orElse(0L);
+		setLastupdatetimestamp(Math.max(ownTimestamp, otherTimestamp));
+	}
+
+	/**
+	 * Replaces the data info of this object with the one of the given object when the latter
+	 * is not null and {@code compareTrust(this, o)} is negative.
+	 *
+	 * @param o the object whose data info may replace the current one
+	 */
+	public void mergeOAFDataInfo(Oaf o) {
+		if (o.getDataInfo() == null) {
+			return;
+		}
+		if (compareTrust(this, o) < 0) {
+			dataInfo = o.getDataInfo();
+		}
+	}

 	protected String extractTrust(Oaf e) {
@ -62,7 +91,7 @@ public abstract class Oaf implements Serializable {
 		if (o == null || getClass() != o.getClass())
 			return false;
 		Oaf oaf = (Oaf) o;
-		return Objects.equals(dataInfo, oaf.dataInfo)
+		return Objects.equals(getDataInfo(), oaf.getDataInfo())
 			&& Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp);
 	}

--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java
@ -78,14 +78,10 @@ public abstract class OafEntity extends Oaf implements Serializable {
 	}

 	public void mergeFrom(OafEntity e) {
-
-		if (e == null)
-			return;
+		super.mergeFrom(e);

 		originalId = mergeLists(originalId, e.getOriginalId());

-		collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
-
 		pid = mergeLists(pid, e.getPid());

 		if (e.getDateofcollection() != null && compareTrust(this, e) < 0)
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java
@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable {
 			? p.getFundedamount()
 			: fundedamount;

-		// programme = mergeLists(programme, p.getProgramme());
-
 		h2020classification = mergeLists(h2020classification, p.getH2020classification());

 		mergeOAFDataInfo(e);
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
@ -130,19 +130,7 @@ public class Relation extends Oaf {
 			Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
 		checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");

-		setCollectedfrom(
-			Stream
-				.concat(
-					Optional
-						.ofNullable(getCollectedfrom())
-						.map(Collection::stream)
-						.orElse(Stream.empty()),
-					Optional
-						.ofNullable(r.getCollectedfrom())
-						.map(Collection::stream)
-						.orElse(Stream.empty()))
-				.distinct() // relies on KeyValue.equals
-				.collect(Collectors.toList()));
+		super.mergeFrom(r);
 	}

 	@Override
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
@ -243,7 +243,7 @@ public class Result extends OafEntity implements Serializable {

 		Result r = (Result) e;

-		// TODO consider merging also Measures
+		measures = mergeLists(measures, r.getMeasures());

 		instance = mergeLists(instance, r.getInstance());

--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java
@ -1,7 +1,14 @@

-package eu.dnetlib.doiboost.orcid.model;
+package eu.dnetlib.dhp.schema.orcid;

 import java.io.Serializable;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+
+/**
+ * This class models the data that are retrieved from orcid publication
+ */

 public class AuthorData implements Serializable {

@ -10,6 +17,7 @@ public class AuthorData implements Serializable {
 	private String surname;
 	private String creditName;
 	private String errorCode;
+	private List<String> otherNames;

 	public String getErrorCode() {
 		return errorCode;
@ -50,4 +58,15 @@ public class AuthorData implements Serializable {
 	public void setOid(String oid) {
 		this.oid = oid;
 	}
+
+	/**
+	 * @return the other names of the author, exactly as they were last set
+	 */
+	public List<String> getOtherNames() {
+		return this.otherNames;
+	}
+
+	/**
+	 * Sets the other names of the author.
+	 *
+	 * @param otherNames the list to store; the given reference is kept as is, so passing
+	 *                   {@code null} stores {@code null}
+	 */
+	public void setOtherNames(List<String> otherNames) {
+		// the former "create an empty list when the field is null" step was dead code:
+		// whatever it created was unconditionally overwritten by this assignment
+		this.otherNames = otherNames;
+	}
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java
@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.schema.orcid;
+
+import java.util.List;
+
+/**
+ * Bean holding a DOI together with a list of {@link AuthorData}.
+ */
+public class OrcidDOI {
+
+	private String doi;
+	private List<AuthorData> authors;
+
+	/**
+	 * @return the doi
+	 */
+	public String getDoi() {
+		return this.doi;
+	}
+
+	/**
+	 * @return the list of authors
+	 */
+	public List<AuthorData> getAuthors() {
+		return this.authors;
+	}
+
+	/**
+	 * @param doi the doi to store
+	 */
+	public void setDoi(String doi) {
+		this.doi = doi;
+	}
+
+	/**
+	 * @param authors the list of authors to store
+	 */
+	public void setAuthors(List<AuthorData> authors) {
+		this.authors = authors;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipDeserialize.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipDeserialize.java
@ -0,0 +1,28 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * Maps the model of the bipFinder! input data: every key is associated with a list of {@link Score}.
+ * Only needed for deserialization purposes.
+ */
+
+public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
+
+	public BipDeserialize() {
+		super();
+	}
+
+	/**
+	 * Looks up the scores stored under the given key.
+	 *
+	 * @param key the key to look up
+	 * @return the list stored for the key, or a new empty list when the lookup yields {@code null}
+	 */
+	public List<Score> get(String key) {
+		final List<Score> scores = super.get(key);
+		return scores != null ? scores : new ArrayList<>();
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipScore.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipScore.java
@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Rewriting of the bipFinder input data: the identifier of the result (doi) is extracted and
+ * kept next to the list of its scores.
+ */
+
+public class BipScore implements Serializable {
+
+	// doi
+	private String id;
+	// unit as given in the inputfile
+	private List<Score> scoreList;
+
+	/**
+	 * @return the identifier
+	 */
+	public String getId() {
+		return this.id;
+	}
+
+	/**
+	 * @return the list of scores
+	 */
+	public List<Score> getScoreList() {
+		return this.scoreList;
+	}
+
+	/**
+	 * @param id the identifier to store
+	 */
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	/**
+	 * @param scoreList the list of scores to store
+	 */
+	public void setScoreList(List<Score> scoreList) {
+		this.scoreList = scoreList;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java
@ -0,0 +1,85 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+/**
+ * Collects all the atomic actions produced for the different result types (publication, dataset,
+ * otherresearchproduct, software) and saves them together in the output path for the ActionSet.
+ */
+public class CollectAndSave implements Serializable {
+
+	private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
+
+	/**
+	 * Parses the arguments described in input_actionset_parameter.json and runs the job.
+	 *
+	 * @param args command line arguments: isSparkSessionManaged (optional, defaults to true),
+	 *             inputPath and outputPath
+	 * @throws Exception when the parameters cannot be read or parsed, or when the Spark job fails
+	 */
+	public static void main(String[] args) throws Exception {
+
+		String jsonConfiguration = IOUtils
+			.toString(
+				CollectAndSave.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+		parser.parseArgument(args);
+
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String inputPath = parser.get("inputPath");
+		log.info("inputPath: {}", inputPath);
+
+		final String outputPath = parser.get("outputPath");
+		log.info("outputPath: {}", outputPath);
+
+		SparkConf conf = new SparkConf();
+
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				// the output path is removed before writing
+				removeOutputDir(spark, outputPath);
+				collectAndSave(spark, inputPath, outputPath);
+			});
+	}
+
+	/**
+	 * Reads the sequence files found under the publication, dataset, otherresearchproduct and
+	 * software sub-paths of the input path and writes their union as one sequence file.
+	 *
+	 * @param spark      the Spark session
+	 * @param inputPath  the path containing one sub-path of atomic actions per result type
+	 * @param outputPath the path where the union is written
+	 */
+	private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
+		JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		sc
+			.sequenceFile(inputPath + "/publication", Text.class, Text.class)
+			.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
+			.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
+			.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
+			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
+	}
+
+	/**
+	 * Removes the given path using the Hadoop configuration of the Spark context.
+	 */
+	private static void removeOutputDir(SparkSession spark, String path) {
+		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/KeyValue.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/KeyValue.java
@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import java.io.Serializable;
+
+/**
+ * Plain key/value pair of strings.
+ * Note that SparkAtomicActionScoreJob explicitly imports eu.dnetlib.dhp.schema.oaf.KeyValue,
+ * which is a different class with the same simple name.
+ */
+public class KeyValue implements Serializable {
+
+	private String key;
+	private String value;
+
+	/**
+	 * @return the key
+	 */
+	public String getKey() {
+		return this.key;
+	}
+
+	/**
+	 * @return the value
+	 */
+	public String getValue() {
+		return this.value;
+	}
+
+	/**
+	 * @param key the key to store
+	 */
+	public void setKey(String key) {
+		this.key = key;
+	}
+
+	/**
+	 * @param value the value to store
+	 */
+	public void setValue(String value) {
+		this.value = value;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
@ -0,0 +1,28 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import java.io.Serializable;
+
+/**
+ * Subset of the information of the generic results needed to create the atomic action:
+ * the identifier of the result and one of its doi.
+ */
+public class PreparedResult implements Serializable {
+
+	// openaire id
+	private String id;
+	// doi
+	private String value;
+
+	/**
+	 * @return the identifier
+	 */
+	public String getId() {
+		return this.id;
+	}
+
+	/**
+	 * @return the value
+	 */
+	public String getValue() {
+		return this.value;
+	}
+
+	/**
+	 * @param id the identifier to store
+	 */
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	/**
+	 * @param value the value to store
+	 */
+	public void setValue(String value) {
+		this.value = value;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Score.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Score.java
@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Represents the score in the input file: an identifier plus a list of key/value units.
+ */
+public class Score implements Serializable {
+
+	private String id;
+	private List<KeyValue> unit;
+
+	/**
+	 * @return the identifier of the score
+	 */
+	public String getId() {
+		return this.id;
+	}
+
+	/**
+	 * @return the list of units
+	 */
+	public List<KeyValue> getUnit() {
+		return this.unit;
+	}
+
+	/**
+	 * @param id the identifier to store
+	 */
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	/**
+	 * @param unit the list of units to store
+	 */
+	public void setUnit(List<KeyValue> unit) {
+		this.unit = unit;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -0,0 +1,200 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.MapGroupsFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import scala.Tuple2;
+
+/**
+ * Creates the atomic actions carrying the bipFinder scores for one type of result.
+ * <p>
+ * The scores are joined with the results through the doi found among the result pids; for every
+ * matching result a {@link Result} holding the scores as measures is wrapped in an
+ * {@link AtomicAction} and written to a sequence file.
+ */
+public class SparkAtomicActionScoreJob implements Serializable {
+
+	private static final String DOI = "doi";
+	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	/**
+	 * Parses the arguments described in input_parameters.json and runs the job.
+	 *
+	 * @param args command line arguments: isSparkSessionManaged (optional, defaults to true),
+	 *             inputPath, outputPath, bipScorePath and resultTableName
+	 * @throws Exception when the parameters cannot be read or parsed, when the class named by
+	 *                   resultTableName cannot be loaded, or when the Spark job fails
+	 */
+	public static <I extends Result> void main(String[] args) throws Exception {
+
+		String jsonConfiguration = IOUtils
+			.toString(
+				SparkAtomicActionScoreJob.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+		parser.parseArgument(args);
+
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String inputPath = parser.get("inputPath");
+		log.info("inputPath: {}", inputPath);
+
+		final String outputPath = parser.get("outputPath");
+		log.info("outputPath: {}", outputPath);
+
+		final String bipScorePath = parser.get("bipScorePath");
+		log.info("bipScorePath: {}", bipScorePath);
+
+		final String resultClassName = parser.get("resultTableName");
+		log.info("resultTableName: {}", resultClassName);
+
+		// the class is only known by name at runtime, hence the cast cannot be checked
+		@SuppressWarnings("unchecked")
+		Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
+
+		SparkConf conf = new SparkConf();
+
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				// the output path is removed before writing
+				removeOutputDir(spark, outputPath);
+				prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
+			});
+	}
+
+	/**
+	 * Joins the bipFinder scores with the results and writes one atomic action per matching result.
+	 *
+	 * @param spark        the Spark session
+	 * @param inputPath    the path of the results, one JSON record per line
+	 * @param outputPath   the path where the sequence file of atomic actions is written
+	 * @param bipScorePath the path of the bipFinder scores, one JSON record per line
+	 * @param inputClazz   the class of the results to read, also used as class of the atomic actions
+	 */
+	private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
+		String bipScorePath, Class<I> inputClazz) {
+
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
+			.textFile(bipScorePath)
+			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
+
+		// every key of each deserialized map becomes one BipScore having that key as id
+		Dataset<BipScore> bipScores = spark
+			.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
+				BipScore bs = new BipScore();
+				bs.setId(key);
+				bs.setScoreList(entry.get(key));
+				return bs;
+			}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
+
+		Dataset<I> results = readPath(spark, inputPath, inputClazz);
+
+		results.createOrReplaceTempView("result");
+
+		// one row per doi pid of each result not deleted by inference
+		Dataset<PreparedResult> preparedResult = spark
+			.sql(
+				"select pIde.value value, id " +
+					"from result " +
+					"lateral view explode (pid) p as pIde " +
+					"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
+			.as(Encoders.bean(PreparedResult.class));
+
+		bipScores
+			.joinWith(
+				preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
+				"inner")
+			.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
+				// replace the doi with the identifier of the matching result
+				BipScore ret = value._1();
+				ret.setId(value._2().getId());
+				return ret;
+			}, Encoders.bean(BipScore.class))
+			.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
+			.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
+				// all the scores grouped under the same result id end up in a single Result
+				Result ret = new Result();
+				ret.setDataInfo(getDataInfo());
+				BipScore first = it.next();
+				ret.setId(first.getId());
+
+				ret.setMeasures(getMeasure(first));
+				it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
+
+				return ret;
+			}, Encoders.bean(Result.class))
+			.toJavaRDD()
+			.map(p -> new AtomicAction(inputClazz, p))
+			.mapToPair(
+				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
+					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
+			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
+
+	}
+
+	/**
+	 * Converts the scores of the given BipScore into measures.
+	 *
+	 * @param value the BipScore whose score list is converted
+	 * @return one {@link Measure} per score, each unit being copied into a {@link KeyValue}
+	 */
+	private static List<Measure> getMeasure(BipScore value) {
+		return value
+			.getScoreList()
+			.stream()
+			.map(score -> {
+				Measure m = new Measure();
+				m.setId(score.getId());
+				m
+					.setUnit(
+						score
+							.getUnit()
+							.stream()
+							.map(unit -> {
+								KeyValue kv = new KeyValue();
+								kv.setValue(unit.getValue());
+								kv.setKey(unit.getKey());
+								kv.setDataInfo(getDataInfo());
+								return kv;
+							})
+							.collect(Collectors.toList()));
+				return m;
+			})
+			.collect(Collectors.toList());
+	}
+
+	/**
+	 * @return a new DataInfo with inferred, invisible and deletedbyinference set to false, an empty
+	 *         trust and the sysimport:actionset provenance action
+	 */
+	private static DataInfo getDataInfo() {
+		DataInfo di = new DataInfo();
+		di.setInferred(false);
+		di.setInvisible(false);
+		di.setDeletedbyinference(false);
+		di.setTrust("");
+		Qualifier qualifier = new Qualifier();
+		qualifier.setClassid("sysimport:actionset");
+		qualifier.setClassname("Harvested");
+		qualifier.setSchemename("dnet:provenanceActions");
+		qualifier.setSchemeid("dnet:provenanceActions");
+		di.setProvenanceaction(qualifier);
+		return di;
+	}
+
+	/**
+	 * Removes the given path using the Hadoop configuration of the Spark context.
+	 */
+	private static void removeOutputDir(SparkSession spark, String path) {
+		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+	}
+
+	/**
+	 * Reads a text file and deserializes every line as a JSON record of the given class.
+	 *
+	 * @param spark     the Spark session
+	 * @param inputPath the path of the text file to read
+	 * @param clazz     the bean class of the records
+	 * @return the dataset of the deserialized records
+	 */
+	public static <R> Dataset<R> readPath(
+		SparkSession spark, String inputPath, Class<R> clazz) {
+		return spark
+			.read()
+			.textFile(inputPath)
+			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
@ -0,0 +1,20 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "ip",
+    "paramLongName": "inputPath",
+    "paramDescription": "the path containing the atomic actions produced for each type of result",
+    "paramRequired": true
+  },
+  {
+    "paramName": "o",
+    "paramLongName": "outputPath",
+    "paramDescription": "the path of the new ActionSet",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json
@ -0,0 +1,32 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "ip",
+    "paramLongName": "inputPath",
+    "paramDescription": "the input path of the results to be extended with the bipFinder scores",
+    "paramRequired": true
+  },
+  {
+    "paramName": "o",
+    "paramLongName": "outputPath",
+    "paramDescription": "the path of the new ActionSet",
+    "paramRequired": true
+  },
+  {
+    "paramName": "rtn",
+    "paramLongName": "resultTableName",
+    "paramDescription": "the fully qualified name of the class of the results to be processed",
+    "paramRequired": true
+  },
+  {
+    "paramName": "bsp",
+    "paramLongName": "bipScorePath",
+    "paramDescription": "the path where to find the bipFinder scores",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
@ -0,0 +1,171 @@
+<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>inputPath</name>
+            <description>the input path of the resources to be extended</description>
+        </property>
+
+        <property>
+            <name>bipScorePath</name>
+            <description>the path where to find the bipFinder scores</description>
+        </property>
+        <property>
+            <name>outputPath</name>
+            <description>the path where to store the actionset</description>
+        </property>
+    </parameters>
+
+    <start to="deleteoutputpath"/>
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <action name="deleteoutputpath">
+        <fs>
+            <delete path="${outputPath}"/>
+            <mkdir path="${outputPath}"/>
+            <delete path="${workingDir}"/>
+            <mkdir path="${workingDir}"/>
+        </fs>
+        <ok to="atomicactions"/>
+        <error to="Kill"/>
+    </action>
+
+    <fork name="atomicactions">
+        <path start="atomicactions_publication"/>
+        <path start="atomicactions_dataset"/>
+        <path start="atomicactions_orp"/>
+        <path start="atomicactions_software"/>
+    </fork>
+
+    <action name="atomicactions_publication">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Produces the atomic action with the bip finder scores for publications</name>
+            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
+            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
+        </spark>
+        <ok to="join_aa"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="atomicactions_dataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Produces the atomic action with the bip finder scores for datasets</name>
+            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
+            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
+        </spark>
+        <ok to="join_aa"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="atomicactions_orp">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Produces the atomic action with the bip finder scores for orp</name>
+            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
+            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
+        </spark>
+        <ok to="join_aa"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="atomicactions_software">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Produces the atomic action with the bip finder scores for software</name>
+            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/software</arg>
+            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
+        </spark>
+        <ok to="join_aa"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name="join_aa" to="collectandsave"/>
+
+    <action name="collectandsave">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>saves all the aa produced for the several types of results in the as output path</name>
+            <class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingDir}</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
@ -0,0 +1,323 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+
+public class SparkAtomicActionScoreJobTest {
+
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	private static SparkSession spark;
+
+	private static Path workingDir;
+	private static final Logger log = LoggerFactory
+		.getLogger(SparkAtomicActionScoreJobTest.class);
+
+	/**
+	 * Creates the temporary work dir and the local Spark session shared by all the tests.
+	 */
+	@BeforeAll
+	public static void beforeAll() throws IOException {
+		final String testName = SparkAtomicActionScoreJobTest.class.getSimpleName();
+
+		workingDir = Files.createTempDirectory(testName);
+		log.info("using work dir {}", workingDir);
+
+		// local configuration, with the warehouse directories placed under the work dir
+		final SparkConf conf = new SparkConf()
+			.setAppName(testName)
+			.setMaster("local[*]")
+			.set("spark.driver.host", "localhost")
+			.set("hive.metastore.local", "true")
+			.set("spark.ui.enabled", "false")
+			.set("spark.sql.warehouse.dir", workingDir.toString())
+			.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+		spark = SparkSession
+			.builder()
+			.appName(testName)
+			.config(conf)
+			.getOrCreate();
+	}
+
+	/**
+	 * Deletes the temporary work dir and stops the Spark session.
+	 */
+	@AfterAll
+	public static void afterAll() throws IOException {
+		try {
+			FileUtils.deleteDirectory(workingDir.toFile());
+		} finally {
+			// always release the Spark session, even when the removal of the work dir fails
+			spark.stop();
+		}
+	}
+
+	/**
+	 * Runs the job on publication.json and verifies that exactly one atomic action is produced,
+	 * carrying one influence and one popularity value.
+	 */
+	@Test
+	public void matchOne() throws Exception {
+		String bipScoresPath = getClass()
+			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
+			.getPath();
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
+			.getPath();
+
+		SparkAtomicActionScoreJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-bipScorePath",
+					bipScoresPath,
+					"-resultTableName",
+					"eu.dnetlib.dhp.schema.oaf.Publication",
+					"-outputPath",
+					workingDir.toString() + "/actionSet"
+				});
+
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		JavaRDD<Publication> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Publication) aa.getPayload()));
+
+		// assertEquals reports the actual count when the check fails
+		Assertions.assertEquals(1, tmp.count());
+
+		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
+		verificationDataset.createOrReplaceTempView("publication");
+
+		// one row per unit of each measure of each publication
+		Dataset<Row> execVerification = spark
+			.sql(
+				"Select p.id oaid, mes.id, mUnit.value from publication p " +
+					"lateral view explode(measures) m as mes " +
+					"lateral view explode(mes.unit) u as mUnit ");
+
+		Assertions.assertEquals(2, execVerification.count());
+
+		Assertions
+			.assertEquals(
+				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
+				execVerification.select("oaid").collectAsList().get(0).getString(0));
+
+		Assertions
+			.assertEquals(
+				"1.47565045883e-08",
+				execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
+
+		Assertions
+			.assertEquals(
+				"0.227515392",
+				execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
+
+	}
+
+	/**
+	 * Runs the job on publication_2.json and verifies that exactly one atomic action is produced,
+	 * carrying two distinct influence values and two popularity values.
+	 */
+	@Test
+	public void matchOneWithTwo() throws Exception {
+		String bipScoresPath = getClass()
+			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
+			.getPath();
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
+			.getPath();
+
+		SparkAtomicActionScoreJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-bipScorePath",
+					bipScoresPath,
+					"-resultTableName",
+					"eu.dnetlib.dhp.schema.oaf.Publication",
+					"-outputPath",
+					workingDir.toString() + "/actionSet"
+				});
+
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		JavaRDD<Publication> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Publication) aa.getPayload()));
+
+		// assertEquals reports the actual count when the check fails
+		Assertions.assertEquals(1, tmp.count());
+
+		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
+		verificationDataset.createOrReplaceTempView("publication");
+
+		// one row per unit of each measure of each publication
+		Dataset<Row> execVerification = spark
+			.sql(
+				"Select p.id oaid, mes.id, mUnit.value from publication p " +
+					"lateral view explode(measures) m as mes " +
+					"lateral view explode(mes.unit) u as mUnit ");
+
+		Assertions.assertEquals(4, execVerification.count());
+
+		Assertions
+			.assertEquals(
+				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
+				execVerification.select("oaid").collectAsList().get(0).getString(0));
+
+		Assertions
+			.assertEquals(
+				2,
+				execVerification.filter("id = 'influence'").count());
+
+		Assertions
+			.assertEquals(
+				2,
+				execVerification.filter("id = 'popularity'").count());
+
+		// the order of the two influence values is not fixed: each must be one of the expected
+		// values and the two must differ
+		List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
+		String tmp_influence = tmp_ds.get(0).getString(0);
+		Assertions
+			.assertTrue(
+				"1.47565045883e-08".equals(tmp_influence) ||
+					"1.98956540239e-08".equals(tmp_influence));
+
+		tmp_influence = tmp_ds.get(1).getString(0);
+		Assertions
+			.assertTrue(
+				"1.47565045883e-08".equals(tmp_influence) ||
+					"1.98956540239e-08".equals(tmp_influence));
+
+		Assertions.assertNotEquals(tmp_ds.get(0).getString(0), tmp_ds.get(1).getString(0));
+
+	}
+
+	/**
+	 * Runs {@link SparkAtomicActionScoreJob} on {@code publication_3.json} together with {@code bip_scores.json}
+	 * and verifies the action set written under {@code workingDir/actionSet}.
+	 * <p>
+	 * Expected outcome: two {@link Publication} payloads whose measures explode into four units in total, i.e.
+	 * one {@code influence} and one {@code popularity} unit for each of the two expected identifiers, each
+	 * carrying the score value asserted below.
+	 *
+	 * @throws Exception if the job execution or the read-back of its output fails
+	 */
+	@Test
+	public void matchTwo() throws Exception {
+		String bipScoresPath = getClass()
+			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
+			.getPath();
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
+			.getPath();
+
+		SparkAtomicActionScoreJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-bipScorePath",
+					bipScoresPath,
+					"-resultTableName",
+					"eu.dnetlib.dhp.schema.oaf.Publication",
+					"-outputPath",
+					workingDir.toString() + "/actionSet"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		// Read back the sequence file produced by the job and unwrap the Publication payloads
+		JavaRDD<Publication> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Publication) aa.getPayload()));
+
+		// assertEquals reports the actual count on failure, unlike assertTrue(count == 2)
+		Assertions.assertEquals(2, tmp.count());
+
+		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
+		verificationDataset.createOrReplaceTempView("publication");
+
+		// One row per (publication, measure, unit)
+		Dataset<Row> execVerification = spark
+			.sql(
+				"Select p.id oaid, mes.id, mUnit.value from publication p " +
+					"lateral view explode(measures) m as mes " +
+					"lateral view explode(mes.unit) u as mUnit ");
+
+		Assertions.assertEquals(4, execVerification.count());
+
+		// Two units per publication ...
+		Assertions
+			.assertEquals(
+				2,
+				execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
+
+		Assertions
+			.assertEquals(
+				2,
+				execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
+
+		// ... and two units per measure id
+		Assertions
+			.assertEquals(
+				2,
+				execVerification.filter("id = 'influence'").count());
+
+		Assertions
+			.assertEquals(
+				2,
+				execVerification.filter("id = 'popularity'").count());
+
+		// Exact score value for every (publication, measure id) pair
+		Assertions
+			.assertEquals(
+				"1.47565045883e-08",
+				execVerification
+					.filter(
+						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
+							"and id = 'influence'")
+					.select("value")
+					.collectAsList()
+					.get(0)
+					.getString(0));
+
+		Assertions
+			.assertEquals(
+				"1.98956540239e-08",
+				execVerification
+					.filter(
+						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
+							"and id = 'influence'")
+					.select("value")
+					.collectAsList()
+					.get(0)
+					.getString(0));
+
+		Assertions
+			.assertEquals(
+				"0.282046161584",
+				execVerification
+					.filter(
+						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
+							"and id = 'popularity'")
+					.select("value")
+					.collectAsList()
+					.get(0)
+					.getString(0));
+
+		Assertions
+			.assertEquals(
+				"0.227515392",
+				execVerification
+					.filter(
+						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
+							"and id = 'popularity'")
+					.select("value")
+					.collectAsList()
+					.get(0)
+					.getString(0));
+
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java
@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob {
 			IOUtils
 				.toString(
 					CheckDuplictedIdsJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json")));
 		parser.parseArgument(args);

 		final SparkConf conf = new SparkConf();

-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);

-		final String countPath = parser.get("workingPath") + "/counts";
+		final String countPath = parser.get("outputDir") + "/counts";
 		log.info("countPath: {}", countPath);

 		final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob {
 			.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
 			.write()
 			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(countPath);
 		;

--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java
@ -44,10 +44,10 @@ public class GenerateEventsJob {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String eventsPath = workingPath + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);

 		final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
@ -59,6 +59,9 @@ public class GenerateEventsJob {
 		final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
 		log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));

+		final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
+		log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
+
 		final SparkConf conf = new SparkConf();

 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@ -70,12 +73,12 @@ public class GenerateEventsJob {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");

 			final Dataset<ResultGroup> groups = ClusterUtils
-				.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
+				.readPath(spark, workingDir + "/duplicates", ResultGroup.class);

 			final Dataset<Event> dataset = groups
 				.map(
 					g -> EventFinder
-						.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
+						.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
 					Encoders
 						.bean(EventGroup.class))
 				.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java
@ -46,7 +46,7 @@ public class GenerateStatsJob {

 		final SparkConf conf = new SparkConf();

-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);

 		final String dbUrl = parser.get("dbUrl");
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java
@ -46,7 +46,7 @@ public class IndexEventSubsetJob {

 		final SparkConf conf = new SparkConf();

-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);

 		final String index = parser.get("index");
@ -55,6 +55,18 @@ public class IndexEventSubsetJob {
 		final String indexHost = parser.get("esHost");
 		log.info("indexHost: {}", indexHost);

+		final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
+		log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
+
+		final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
+		log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
+
+		final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
+		log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
+
+		final String esNodesWanOnly = parser.get("esNodesWanOnly");
+		log.info("esNodesWanOnly: {}", esNodesWanOnly);
+
 		final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
 		log.info("maxEventsForTopic: {}", maxEventsForTopic);

@ -86,10 +98,10 @@ public class IndexEventSubsetJob {
 		esCfg.put("es.index.auto.create", "false");
 		esCfg.put("es.nodes", indexHost);
 		esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
-		esCfg.put("es.batch.write.retry.count", "8");
-		esCfg.put("es.batch.write.retry.wait", "60s");
-		esCfg.put("es.batch.size.entries", "200");
-		esCfg.put("es.nodes.wan.only", "true");
+		esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
+		esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
+		esCfg.put("es.batch.size.entries", esBatchSizeEntries);
+		esCfg.put("es.nodes.wan.only", esNodesWanOnly);

 		log.info("*** Start indexing");
 		JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java
@ -54,7 +54,7 @@ public class IndexNotificationsJob {

 		final SparkConf conf = new SparkConf();

-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);

 		final String index = parser.get("index");
@ -63,6 +63,18 @@ public class IndexNotificationsJob {
 		final String indexHost = parser.get("esHost");
 		log.info("indexHost: {}", indexHost);

+		final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
+		log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
+
+		final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
+		log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
+
+		final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
+		log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
+
+		final String esNodesWanOnly = parser.get("esNodesWanOnly");
+		log.info("esNodesWanOnly: {}", esNodesWanOnly);
+
 		final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
 		log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);

@ -92,10 +104,10 @@ public class IndexNotificationsJob {
 			esCfg.put("es.index.auto.create", "false");
 			esCfg.put("es.nodes", indexHost);
 			esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
-			esCfg.put("es.batch.write.retry.count", "8");
-			esCfg.put("es.batch.write.retry.wait", "60s");
-			esCfg.put("es.batch.size.entries", "200");
-			esCfg.put("es.nodes.wan.only", "true");
+			esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
+			esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
+			esCfg.put("es.batch.size.entries", esBatchSizeEntries);
+			esCfg.put("es.nodes.wan.only", esNodesWanOnly);

 			log.info("*** Start indexing");
 			JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
@ -36,7 +36,7 @@ public class IndexOnESJob {

 		final SparkConf conf = new SparkConf();

-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);

 		final String index = parser.get("index");
@ -45,6 +45,18 @@ public class IndexOnESJob {
 		final String indexHost = parser.get("esHost");
 		log.info("indexHost: {}", indexHost);

+		final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
+		log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
+
+		final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
+		log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
+
+		final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
+		log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
+
+		final String esNodesWanOnly = parser.get("esNodesWanOnly");
+		log.info("esNodesWanOnly: {}", esNodesWanOnly);
+
 		final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();

 		final JavaRDD<String> inputRdd = ClusterUtils
@ -53,15 +65,13 @@ public class IndexOnESJob {
 			.javaRDD();

 		final Map<String, String> esCfg = new HashMap<>();
-		// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
-
 		esCfg.put("es.index.auto.create", "false");
 		esCfg.put("es.nodes", indexHost);
 		esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
-		esCfg.put("es.batch.write.retry.count", "8");
-		esCfg.put("es.batch.write.retry.wait", "60s");
-		esCfg.put("es.batch.size.entries", "200");
-		esCfg.put("es.nodes.wan.only", "true");
+		esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
+		esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
+		esCfg.put("es.batch.size.entries", esBatchSizeEntries);
+		esCfg.put("es.nodes.wan.only", esNodesWanOnly);

 		JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java
@ -42,10 +42,10 @@ public class JoinStep0Job {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step0";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);

 		final SparkConf conf = new SparkConf();
@ -57,10 +57,10 @@ public class JoinStep0Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");

 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class);

 			final Dataset<RelatedDatasource> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
+				.readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class);

 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java
@ -40,10 +40,10 @@ public class JoinStep1Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step1";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step1";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);

 		final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep1Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");

 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class);

 			final Dataset<RelatedProject> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
+				.readPath(spark, workingDir + "/relatedProjects", RelatedProject.class);

 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java
@ -39,10 +39,10 @@ public class JoinStep2Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step2";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step2";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);

 		final SparkConf conf = new SparkConf();
@ -54,10 +54,10 @@ public class JoinStep2Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");

 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class);

 			final Dataset<RelatedSoftware> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class);
+				.readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class);

 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java
@ -40,10 +40,10 @@ public class JoinStep3Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step3";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step3";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);

 		final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep3Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");

 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class);

 			final Dataset<RelatedDataset> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class);
+				.readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class);

 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java
@ -40,10 +40,10 @@ public class JoinStep4Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step4";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step4";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);

 		final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep4Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");

 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class);

 			final Dataset<RelatedPublication> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class);
+				.readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class);

 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java
@ -4,8 +4,13 @@ package eu.dnetlib.dhp.broker.oa;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;

+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
@ -13,6 +18,8 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.slf4j.Logger;
@ -29,7 +36,7 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 public class PartitionEventsByDsIdJob {

 	private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
-	private static final String OPENDOAR_NSPREFIX = "opendoar____::";
+	private static final String OPENDOAR_NSPREFIX = "10|opendoar____::";

 	public static void main(final String[] args) throws Exception {

@ -37,7 +44,7 @@ public class PartitionEventsByDsIdJob {
 			IOUtils
 				.toString(
 					PartitionEventsByDsIdJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
 		parser.parseArgument(args);

 		final Boolean isSparkSessionManaged = Optional
@ -48,24 +55,43 @@ public class PartitionEventsByDsIdJob {

 		final SparkConf conf = new SparkConf();

-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);

-		final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
+		final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId";
 		log.info("partitionPath: {}", partitionPath);

+		final String opendoarIds = parser.get("opendoarIds");
+		log.info("opendoarIds: {}", opendoarIds);
+
+		final Set<String> validOpendoarIds = new HashSet<>();
+		if (!opendoarIds.trim().equals("-")) {
+			validOpendoarIds
+				.addAll(
+					Arrays
+						.stream(opendoarIds.split(","))
+						.map(String::trim)
+						.filter(StringUtils::isNotBlank)
+						.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
+						.collect(Collectors.toSet()));
+		}
+		log.info("validOpendoarIds: {}", validOpendoarIds);
+
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {

 			ClusterUtils
 				.readPath(spark, eventsPath, Event.class)
-				.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
-				.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
-				.limit(10000)
-				.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
+				.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
+				.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
+				.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
+				.map(
+					(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
+					Encoders.bean(ShortEventMessageWithGroupId.class))
 				.coalesce(1)
 				.write()
 				.partitionBy("group")
 				.mode(SaveMode.Overwrite)
+				.option("compression", "gzip")
 				.json(partitionPath);

 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java
@ -45,10 +45,10 @@ public class PrepareGroupsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String groupsPath = workingPath + "/duplicates";
+		final String groupsPath = workingDir + "/duplicates";
 		log.info("groupsPath: {}", groupsPath);

 		final SparkConf conf = new SparkConf();
@ -60,10 +60,10 @@ public class PrepareGroupsJob {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");

 			final Dataset<OaBrokerMainEntity> results = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class);

 			final Dataset<Relation> mergedRels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));

 			final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java
@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String relsPath = workingPath + "/relatedDatasets";
+		final String relsPath = workingDir + "/relatedDatasets";
 		log.info("relsPath: {}", relsPath);

 		final SparkConf conf = new SparkConf();
@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob {
 				.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));

 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
 				.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob {
 			final Dataset<RelatedDataset> dataset = rels
 				.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
 				.map(t -> {
-					final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
+					final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
+						t._2);
 					rel.getRelDataset().setRelType(t._1.getRelClass());
 					return rel;
 				}, Encoders.bean(RelatedDataset.class));
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java
@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String relsPath = workingPath + "/relatedDatasources";
+		final String relsPath = workingDir + "/relatedDatasources";
 		log.info("relsPath: {}", relsPath);

 		final SparkConf conf = new SparkConf();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java
@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String relsPath = workingPath + "/relatedProjects";
+		final String relsPath = workingDir + "/relatedProjects";
 		log.info("relsPath: {}", relsPath);

 		final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob {
 				.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));

 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
 				.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java
@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String relsPath = workingPath + "/relatedPublications";
+		final String relsPath = workingDir + "/relatedPublications";
 		log.info("relsPath: {}", relsPath);

 		final SparkConf conf = new SparkConf();
@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob {
 					Encoders.bean(OaBrokerRelatedPublication.class));

 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
 				.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob {
 			final Dataset<RelatedPublication> dataset = rels
 				.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
 				.map(t -> {
-					final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
+					final RelatedPublication rel = new RelatedPublication(
+						t._1.getSource(), t._2);
 					rel.getRelPublication().setRelType(t._1.getRelClass());
 					return rel;
 				}, Encoders.bean(RelatedPublication.class));
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java
@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String relsPath = workingPath + "/relatedSoftwares";
+		final String relsPath = workingDir + "/relatedSoftwares";
 		log.info("relsPath: {}", relsPath);

 		final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob {
 				.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));

 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
 				.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);

-		final String workingPath = parser.get("workingPath");
-		log.info("workingPath: {}", workingPath);
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);

-		final String simpleEntitiesPath = workingPath + "/simpleEntities";
+		final String simpleEntitiesPath = workingDir + "/simpleEntities";
 		log.info("simpleEntitiesPath: {}", simpleEntitiesPath);

 		final SparkConf conf = new SparkConf();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java
@ -5,12 +5,16 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

+import org.apache.commons.lang3.StringUtils;
+
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;

 public class EnrichMissingAbstract extends UpdateMatcher<String> {

+	private static final int MIN_LENGTH = 200;
+
 	public EnrichMissingAbstract() {
 		super(1,
 			s -> Topic.ENRICH_MISSING_ABSTRACT,
@ -21,10 +25,15 @@ public class EnrichMissingAbstract extends UpdateMatcher<String> {
 	@Override
 	protected List<String> findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) {
 		if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) {
-			return Arrays.asList(source.getAbstracts().get(0));
-		} else {
-			return new ArrayList<>();
+			return source
+				.getAbstracts()
+				.stream()
+				.filter(s -> StringUtils.normalizeSpace(s).length() >= MIN_LENGTH)
+				.map(Arrays::asList)
+				.findFirst()
+				.orElse(new ArrayList<>());
 		}
+		return new ArrayList<>();
 	}

 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java
@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {

 	public EnrichMissingSubject() {
 		super(20,
-			s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
+			s -> {
+				switch (s.getType().toLowerCase()) {
+					case "acm":
+						return Topic.ENRICH_MISSING_SUBJECT_ACM;
+					case "arxiv":
+						return Topic.ENRICH_MISSING_SUBJECT_ARXIV;
+					case "ddc":
+						return Topic.ENRICH_MISSING_SUBJECT_DDC;
+					case "jel":
+						return Topic.ENRICH_MISSING_SUBJECT_JEL;
+					case "mesh":
+						return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC;
+					case "rvk":
+						return Topic.ENRICH_MISSING_SUBJECT_RVK;
+					default:
+						return null;
+				}
+			},
 			(p, s) -> p.getSubjects().add(s),
 			s -> subjectAsString(s));
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java
@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {

 	public EnrichMoreSubject() {
 		super(20,
-			s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
+			s -> {
+				switch (s.getType().toLowerCase()) {
+					case "acm":
+						return Topic.ENRICH_MORE_SUBJECT_ACM;
+					case "arxiv":
+						return Topic.ENRICH_MORE_SUBJECT_ARXIV;
+					case "ddc":
+						return Topic.ENRICH_MORE_SUBJECT_DDC;
+					case "jel":
+						return Topic.ENRICH_MORE_SUBJECT_JEL;
+					case "mesh":
+						return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC;
+					case "rvk":
+						return Topic.ENRICH_MORE_SUBJECT_RVK;
+					default:
+						return null;
+				}
+			},
 			(p, s) -> p.getSubjects().add(s),
 			s -> subjectAsString(s));
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java
@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.oaf.Relation;

 public class ClusterUtils {

@ -30,6 +31,16 @@ public class ClusterUtils {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}

+	public static Dataset<Relation> loadRelations(final String graphPath, final SparkSession spark) {
+		return ClusterUtils
+			.readPath(spark, graphPath + "/relation", Relation.class)
+			.map(r -> {
+				r.setSource(ConversionUtils.cleanOpenaireId(r.getSource()));
+				r.setTarget(ConversionUtils.cleanOpenaireId(r.getTarget()));
+				return r;
+			}, Encoders.bean(Relation.class));
+	}
+
 	public static <R> Dataset<R> readPath(
 		final SparkSession spark,
 		final String inputPath,
@ -67,6 +78,7 @@ public class ClusterUtils {
 			.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(path);
 	}

--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
@ -74,7 +74,7 @@ public class ConversionUtils {
 		}

 		final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
-		res.setOpenaireId(d.getId());
+		res.setOpenaireId(cleanOpenaireId(d.getId()));
 		res.setOriginalId(first(d.getOriginalId()));
 		res.setTitle(structPropValue(d.getTitle()));
 		res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -89,7 +89,7 @@ public class ConversionUtils {
 		}

 		final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
-		res.setOpenaireId(p.getId());
+		res.setOpenaireId(cleanOpenaireId(p.getId()));
 		res.setOriginalId(first(p.getOriginalId()));
 		res.setTitle(structPropValue(p.getTitle()));
 		res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -106,7 +106,7 @@ public class ConversionUtils {

 		final OaBrokerMainEntity res = new OaBrokerMainEntity();

-		res.setOpenaireId(result.getId());
+		res.setOpenaireId(cleanOpenaireId(result.getId()));
 		res.setOriginalId(first(result.getOriginalId()));
 		res.setTypology(classId(result.getResulttype()));
 		res.setTitles(structPropList(result.getTitle()));
@ -129,6 +129,10 @@ public class ConversionUtils {
 		return res;
 	}

+	public static String cleanOpenaireId(final String id) {
+		return id.contains("|") ? StringUtils.substringAfter(id, "|") : id;
+	}
+
 	private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
 		if (author == null) {
 			return null;
@ -188,7 +192,7 @@ public class ConversionUtils {
 		}

 		final OaBrokerProject res = new OaBrokerProject();
-		res.setOpenaireId(p.getId());
+		res.setOpenaireId(cleanOpenaireId(p.getId()));
 		res.setTitle(fieldValue(p.getTitle()));
 		res.setAcronym(fieldValue(p.getAcronym()));
 		res.setCode(fieldValue(p.getCode()));
@ -214,7 +218,7 @@ public class ConversionUtils {
 		}

 		final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
-		res.setOpenaireId(sw.getId());
+		res.setOpenaireId(cleanOpenaireId(sw.getId()));
 		res.setName(structPropValue(sw.getTitle()));
 		res.setDescription(fieldValue(sw.getDescription()));
 		res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
@ -230,7 +234,7 @@ public class ConversionUtils {

 		final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
 		res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
-		res.setOpenaireId(ds.getId());
+		res.setOpenaireId(cleanOpenaireId(ds.getId()));
 		res.setType(classId(ds.getDatasourcetype()));
 		return res;
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java
@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable {
 		final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
 		collectedFromSet
 			.stream()
-			.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
+			.map(
+				s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
+					BrokerConstants.COLLECTED_FROM_REL))
 			.forEach(res::addTuple);
-		hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
+
+		hostedBySet
+			.stream()
+			.map(
+				s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
+					BrokerConstants.HOSTED_BY_REL))
+			.forEach(res::addTuple);
+
 		return res;
 	}

--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java
@ -76,6 +76,7 @@ public class EventFinder {
 		final Set<String> dsIdWhitelist,
 		final Set<String> dsIdBlacklist,
 		final Set<String> dsTypeWhitelist,
+		final Set<String> topicWhitelist,
 		final Map<String, LongAccumulator> accumulators) {

 		final List<UpdateInfo<?>> list = new ArrayList<>();
@ -84,7 +85,13 @@ public class EventFinder {
 			for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
 				if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
 					for (final UpdateMatcher<?> matcher : matchers) {
-						list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
+						for (final UpdateInfo<?> info : matcher
+							.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
+							if (topicWhitelist == null || topicWhitelist.isEmpty()
+								|| topicWhitelist.contains(info.getTopic().getPath())) {
+								list.add(info);
+							}
+						}
 					}
 				}
 			}
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/check_duplicates.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/check_duplicates.json
@ -0,0 +1,9 @@
+[
+
+	{
+		"paramName": "o",
+		"paramLongName": "outputDir",
+		"paramDescription": "the path where the data are stored",
+		"paramRequired": true
+	}
+]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json
@ -7,7 +7,7 @@
 	},
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "workingDir",
 		"paramDescription": "the path where the temporary data will be stored",
 		"paramRequired": true
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
@ -6,7 +6,7 @@
            <description>the path where the graph is stored</description>
        </property>
        <property>
-            <name>workingPath</name>
+            <name>outputDir</name>
            <description>the path where the the generated data will be stored</description>
        </property>
 		<property>
@ -24,6 +24,11 @@
            <value>-</value>
            <description>a black list (comma separeted, - for empty list) of datasource ids</description>
        </property>
+        <property>
+            <name>topicWhitelist</name>
+            <value>*</value>
+            <description>a white list (comma separated, * for all) of topics</description>
+        </property>
        <property>
            <name>esEventIndexName</name>
            <description>the elasticsearch index name for events</description>
@ -36,6 +41,26 @@
            <name>esIndexHost</name>
            <description>the elasticsearch host</description>
        </property>
+        <property>
+            <name>esBatchWriteRetryCount</name>
+            <value>8</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esBatchWriteRetryWait</name>
+            <value>60s</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esBatchSizeEntries</name>
+            <value>200</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esNodesWanOnly</name>
+            <value>true</value>
+            <description>an ES configuration property</description>
+        </property>
        <property>
        	<name>maxIndexedEventsForDsAndTopic</name>
        	<description>the max number of events for each couple (ds/topic)</description>
@ -111,15 +136,15 @@
        </configuration>
    </global>

-    <start to="ensure_working_path"/>
+    <start to="ensure_output_dir"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    
-    <action name="ensure_working_path">
+    <action name="ensure_output_dir">
        <fs>
-            <mkdir path='${workingPath}'/>
+            <mkdir path='${outputDir}'/>
        </fs>
        <ok to="start_entities_and_rels"/>
        <error to="Kill"/>
@ -152,7 +177,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -176,7 +201,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -201,7 +226,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -225,7 +250,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -249,7 +274,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -273,7 +298,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -299,7 +324,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step1"/>
        <error to="Kill"/>
@ -323,7 +348,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step2"/>
        <error to="Kill"/>
@ -347,7 +372,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step3"/>
        <error to="Kill"/>
@ -371,7 +396,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step4"/>
        <error to="Kill"/>
@ -395,7 +420,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="prepare_groups"/>
        <error to="Kill"/>
@ -419,7 +444,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="generate_events"/>
        <error to="Kill"/>
@ -442,10 +467,12 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
 			<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
 			<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
 			<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
+			<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
        </spark>
        <ok to="index_event_subset"/>
        <error to="Kill"/>
@ -468,9 +495,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--index</arg><arg>${esEventIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
+            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
+            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
+            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
+            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
            <arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
@ -495,9 +526,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--index</arg><arg>${esNotificationsIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
+            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
+            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
+            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
+            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="stats"/>
@ -521,7 +556,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
            <arg>--dbUser</arg><arg>${brokerDbUser}</arg>
            <arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json
@ -1,7 +1,13 @@
 [
+	{
+		"paramName": "wp",
+		"paramLongName": "workingDir",
+		"paramDescription": "the path where the temporary data are stored",
+		"paramRequired": true
+	},
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "outputDir",
 		"paramDescription": "the path where the generated events will be stored",
 		"paramRequired": true
 	},
@ -22,5 +28,11 @@
 		"paramLongName": "datasourceIdBlacklist",
 		"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
 		"paramRequired": true
+	},
+	{
+		"paramName": "topicWhitelist",
+		"paramLongName": "topicWhitelist",
+		"paramDescription": "a white list (comma separated, * for all) of topics",
+		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json
@ -1,8 +1,8 @@
 [
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
-		"paramDescription": "the workinh path",
+		"paramLongName": "outputDir",
+		"paramDescription": "the data path",
 		"paramRequired": true
 	},
 	{
@ -16,5 +16,29 @@
 		"paramLongName": "esHost",
 		"paramDescription": "the ES host",
 		"paramRequired": true
+	},
+	{
+		"paramName": "esBatchWriteRetryCount",
+		"paramLongName": "esBatchWriteRetryCount",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esBatchWriteRetryWait",
+		"paramLongName": "esBatchWriteRetryWait",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esBatchSizeEntries",
+		"paramLongName": "esBatchSizeEntries",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esNodesWanOnly",
+		"paramLongName": "esNodesWanOnly",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json
@ -1,8 +1,8 @@
 [
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
-		"paramDescription": "the workinh path",
+		"paramLongName": "outputDir",
+		"paramDescription": "the path where the generated data are stored",
 		"paramRequired": true
 	},
 	{
@ -16,7 +16,31 @@
 		"paramLongName": "esHost",
 		"paramDescription": "the ES host",
 		"paramRequired": true
+	},	
+	{
+		"paramName": "esBatchWriteRetryCount",
+		"paramLongName": "esBatchWriteRetryCount",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
 	},
+	{
+		"paramName": "esBatchWriteRetryWait",
+		"paramLongName": "esBatchWriteRetryWait",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esBatchSizeEntries",
+		"paramLongName": "esBatchSizeEntries",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esNodesWanOnly",
+		"paramLongName": "esNodesWanOnly",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},	
 	{
 		"paramName": "n",
 		"paramLongName": "maxEventsForTopic",
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json
@ -1,8 +1,8 @@
 [
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
-		"paramDescription": "the workinh path",
+		"paramLongName": "outputDir",
+		"paramDescription": "the dir that contains the events folder",
 		"paramRequired": true
 	},
 	{
@ -17,6 +17,30 @@
 		"paramDescription": "the ES host",
 		"paramRequired": true
 	},
+	{
+		"paramName": "esBatchWriteRetryCount",
+		"paramLongName": "esBatchWriteRetryCount",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esBatchWriteRetryWait",
+		"paramLongName": "esBatchWriteRetryWait",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esBatchSizeEntries",
+		"paramLongName": "esBatchSizeEntries",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
+	{
+		"paramName": "esNodesWanOnly",
+		"paramLongName": "esNodesWanOnly",
+		"paramDescription": "an ES configuration property",
+		"paramRequired": true
+	},
 	{
 		"paramName": "broker",
 		"paramLongName": "brokerApiBaseUrl",
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
@ -1,4 +1,4 @@
-<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="update broker notifications" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
@ -6,8 +6,8 @@
            <description>the path where the graph is stored</description>
        </property>
        <property>
-            <name>workingPath</name>
-            <description>the path where the the generated data will be stored</description>
+            <name>outputDir</name>
+            <description>the path where the generated data are stored</description>
        </property>
 		<property>
            <name>datasourceIdWhitelist</name>
@ -36,6 +36,26 @@
            <name>esIndexHost</name>
            <description>the elasticsearch host</description>
        </property>
+        <property>
+            <name>esBatchWriteRetryCount</name>
+            <value>8</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esBatchWriteRetryWait</name>
+            <value>60s</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esBatchSizeEntries</name>
+            <value>200</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esNodesWanOnly</name>
+            <value>true</value>
+            <description>an ES configuration property</description>
+        </property>
        <property>
        	<name>maxIndexedEventsForDsAndTopic</name>
        	<description>the max number of events for each couple (ds/topic)</description>
@ -122,9 +142,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--index</arg><arg>${esNotificationsIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
+            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
+            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
+            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
+            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json
@ -0,0 +1,14 @@
+[
+	{
+		"paramName": "o",
+		"paramLongName": "outputDir",
+		"paramDescription": "the path where the data will be stored",
+		"paramRequired": true
+	},
+	{
+		"paramName": "list",
+		"paramLongName": "opendoarIds",
+		"paramDescription": "the opendoar IDs whitelist (comma separated)",
+		"paramRequired": true
+	}
+]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml
@ -0,0 +1,99 @@
+<workflow-app name="partitionEventsByOpendoarIds" xmlns="uri:oozie:workflow:0.5">
+
+    <parameters>
+        <property>
+            <name>opendoarIds</name>
+            <description>the opendoar IDs whitelist (comma separated)</description>
+        </property>
+        <property>
+            <name>outputDir</name>
+            <description>the path where the generated data will be stored</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="opendoarPartition"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    
+   <action name="opendoarPartition"> <!-- runs PartitionEventsByDsIdJob on YARN for the whitelisted OpenDOAR ids -->
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PartitionEventsByDsIdJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--outputDir</arg><arg>${outputDir}</arg> <!-- the workflow declares outputDir; workingDir is not a declared parameter -->
+            <arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    	
+    <end name="End"/>
+
+</workflow-app>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/config-default.xml
@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml
@ -1,41 +1,38 @@
-<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
-            <name>graphInputPath</name>
-            <description>the path where the graph is stored</description>
-        </property>
-        <property>
-            <name>workingPath</name>
+            <name>outputDir</name>
            <description>the path where the the generated data will be stored</description>
-        </property>
-		<property>
-            <name>datasourceIdWhitelist</name>
-            <value>-</value>
-            <description>a white list (comma separeted, - for empty list) of datasource ids</description>
-        </property>
-		<property>
-            <name>datasourceTypeWhitelist</name>
-            <value>-</value>
-            <description>a white list (comma separeted, - for empty list) of datasource types</description>
-        </property>
-		<property>
-            <name>datasourceIdBlacklist</name>
-            <value>-</value>
-            <description>a black list (comma separeted, - for empty list) of datasource ids</description>
        </property>
        <property>
            <name>esEventIndexName</name>
            <description>the elasticsearch index name for events</description>
        </property>
-        <property>
-            <name>esNotificationsIndexName</name>
-            <description>the elasticsearch index name for notifications</description>
-        </property>
        <property>
            <name>esIndexHost</name>
            <description>the elasticsearch host</description>
        </property>
+        <property>
+            <name>esBatchWriteRetryCount</name>
+            <value>8</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esBatchWriteRetryWait</name>
+            <value>60s</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esBatchSizeEntries</name>
+            <value>200</value>
+            <description>an ES configuration property</description>
+        </property>
+		<property>
+            <name>esNodesWanOnly</name>
+            <value>true</value>
+            <description>an ES configuration property</description>
+        </property>
        <property>
        	<name>maxIndexedEventsForDsAndTopic</name>
        	<description>the max number of events for each couple (ds/topic)</description>
@ -44,18 +41,6 @@
        	<name>brokerApiBaseUrl</name>
        	<description>the url of the broker service api</description>
        </property>
-        <property>
-        	<name>brokerDbUrl</name>
-        	<description>the url of the broker database</description>
-        </property>
-        <property>
-        	<name>brokerDbUser</name>
-        	<description>the user of the broker database</description>
-        </property>
-        <property>
-        	<name>brokerDbPassword</name>
-        	<description>the password of the broker database</description>
-        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -111,36 +96,45 @@
        </configuration>
    </global>

-    <start to="partition"/>
+    <start to="index_event_subset"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
-    
-   <action name="partition">
+        
+     <action name="index_event_subset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>PartitionEventsByDsIdJob</name>
-            <class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
+            <name>IndexEventSubsetOnESJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.dynamicAllocation.maxExecutors="8" 
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
+            <arg>--index</arg><arg>${esEventIndexName}</arg>
+            <arg>--esHost</arg><arg>${esIndexHost}</arg>
+            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
+            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
+            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
+            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
+            <arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
+            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
-    	
+    
+   
+
    <end name="End"/>

 </workflow-app>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json
@ -1,8 +1,8 @@
 [
 	{
-		"paramName": "wp",
-		"paramLongName": "workingPath",
-		"paramDescription": "the working path",
+		"paramName": "o",
+		"paramLongName": "outputDir",
+		"paramDescription": "the path where generated data are stored",
 		"paramRequired": true
 	},
 	{
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -10,10 +10,11 @@ import java.io.Serializable;
 import java.nio.file.Paths;
 import java.util.*;

-import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

+import com.fasterxml.jackson.databind.ObjectMapper;
+
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.pace.util.MapDocumentUtil;
@ -100,8 +101,8 @@ public class EntityMergerTest implements Serializable {
 		assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");

 		// verify authors
-		assertEquals(pub_merged.getAuthor().size(), 9);
-		assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
+		assertEquals(13, pub_merged.getAuthor().size());
+		assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));

 		// verify title
 		int count = 0;
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
@ -7,7 +7,6 @@ import java.util.List;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
@ -16,8 +15,8 @@ import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
-import org.codehaus.jackson.map.ObjectMapper;

+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.hash.Hashing;

 import eu.dnetlib.dedup.graph.ConnectedComponent;
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
@ -10,7 +10,8 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
-import org.codehaus.jackson.map.ObjectMapper;
+
+import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@ -14,7 +14,7 @@
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
-                <version>4.0.1</version>
+                <version>${net.alchim31.maven.version}</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
@ -51,7 +51,6 @@
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
-            <version>4.3.4</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
@ -84,6 +83,11 @@
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>

+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+        </dependency>
+


    </dependencies>
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.action.AtomicAction
 import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.commons.lang3.StringUtils
-import org.codehaus.jackson.map.ObjectMapper
+import com.fasterxml.jackson.databind.ObjectMapper
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
 import org.slf4j.{Logger, LoggerFactory}

 import scala.collection.JavaConverters._
-import scala.io.Source


 case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
@ -19,23 +18,18 @@ case class HostedByItemType(id: String, officialname: String, issn: String, eiss
 case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}

 object DoiBoostMappingUtil {
-  def getUnknownCountry(): Qualifier = {
-    createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries")
-  }
-
-

  def generateMAGAffiliationId(affId: String): String = {
    s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
  }

-
  val logger: Logger = LoggerFactory.getLogger(getClass)

  //STATIC STRING
  val MAG = "microsoft"
  val MAG_NAME = "Microsoft Academic Graph"
-  val ORCID = "ORCID"
+  val ORCID = "orcid"
+  val ORCID_PENDING = "orcid_pending"
  val CROSSREF = "Crossref"
  val UNPAYWALL = "UnpayWall"
  val GRID_AC = "grid.ac"
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
@ -39,33 +39,38 @@ object SparkGenerateDOIBoostActionSet {
    val dbaffiliationRelationPath   = parser.get("dbaffiliationRelationPath")
    val dbOrganizationPath          = parser.get("dbOrganizationPath")
    val workingDirPath              = parser.get("targetPath")
+    val sequenceFilePath            = parser.get("sFilePath")

-    spark.read.load(dbDatasetPath).as[OafDataset]
+    val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
      .map(d =>DoiBoostMappingUtil.fixResult(d))
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")

-    spark.read.load(dbPublicationPath).as[Publication]
+    val asPublication =spark.read.load(dbPublicationPath).as[Publication]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

-    spark.read.load(dbOrganizationPath).as[Organization]
+    val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")


-    spark.read.load(crossRefRelation).as[Relation]
+    val asCRelation = spark.read.load(crossRefRelation).as[Relation]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

-    spark.read.load(dbaffiliationRelationPath).as[Relation]
+    val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")


-    val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]

-    d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
+
+    val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
+
+//      spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
+
+    d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])



--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
@ -2,6 +2,7 @@ package eu.dnetlib.doiboost

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.oa.merge.AuthorMerger
+import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
 import eu.dnetlib.doiboost.mag.ConversionUtil
 import org.apache.commons.io.IOUtils
@ -30,7 +31,7 @@ object SparkGenerateDoiBoost {
    import spark.implicits._

    val hostedByMapPath = parser.get("hostedByMapPath")
-    val workingDirPath = parser.get("workingDirPath")
+    val workingDirPath = parser.get("workingPath")


    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
@ -62,7 +63,7 @@ object SparkGenerateDoiBoost {
    val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
    fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")

-    logger.info("Phase 3) Join Result with MAG")
+    logger.info("Phase 4) Join Result with MAG")
    val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))

    val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
@ -132,7 +133,7 @@ object SparkGenerateDoiBoost {
          o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
        if (affiliation.OfficialPage.isDefined)
          o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
-        o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
+        o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
        o
      }
      else
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -14,7 +14,7 @@ import scala.collection.JavaConverters._
 import scala.collection.mutable
 import scala.util.matching.Regex

-case class CrossrefDT(doi: String, json:String) {}
+case class CrossrefDT(doi: String, json:String, timestamp: Long) {}

 case class mappingAffiliation(name: String) {}

@ -200,7 +200,7 @@ case object Crossref2Oaf {
    a.setSurname(family)
    a.setFullname(s"$given $family")
    if (StringUtils.isNotBlank(orcid))
-      a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
+      a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava)

    a
  }
@ -248,7 +248,7 @@ case object Crossref2Oaf {


    def snsfRule(award:String): String = {
-      var tmp1 = StringUtils.substringAfter(award,"_")
+      val tmp1 = StringUtils.substringAfter(award,"_")
      val tmp2 = StringUtils.substringBefore(tmp1,"/")
      logger.debug(s"From $award to $tmp2")
      tmp2
@ -265,18 +265,20 @@ case object Crossref2Oaf {
    }


-    def generateRelation(sourceId:String, targetId:String, nsPrefix:String) :Relation = {
+    def generateRelation(sourceId:String, targetId:String, relClass:String) :Relation = {

      val r = new Relation
      r.setSource(sourceId)
-      r.setTarget(s"40|$nsPrefix::$targetId")
+      r.setTarget(targetId)
      r.setRelType("resultProject")
-      r.setRelClass("isProducedBy")
+      r.setRelClass(relClass)
      r.setSubRelType("outcome")
      r.setCollectedfrom(List(cf).asJava)
      r.setDataInfo(di)
      r.setLastupdatetimestamp(ts)
      r
+
+
    }


@ -284,12 +286,18 @@ case object Crossref2Oaf {
      if (funder.award.isDefined && funder.award.get.nonEmpty)
        funder.award.get.map(extractField).filter(a => a!= null &&  a.nonEmpty).foreach(
          award => {
-            val targetId = DHPUtils.md5(award)
-            queue += generateRelation(sourceId, targetId, nsPrefix)
+            val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
+            queue += generateRelation(sourceId, targetId , "isProducedBy")
+            queue += generateRelation(targetId , sourceId,  "produces")
          }
        )
    }

+    def getProjectId (nsPrefix:String, targetId:String):String = {
+      s"40|$nsPrefix::$targetId"
+    }
+
+
    if (funders != null)
    funders.foreach(funder => {
      if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
@ -310,22 +318,33 @@ case object Crossref2Oaf {
          case "10.13039/501100002341" =>   generateSimpleRelationFromAward(funder, "aka_________", a => a)
          case "10.13039/501100001602" =>   generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
          case "10.13039/501100000923" =>   generateSimpleRelationFromAward(funder, "arc_________", a => a)
-          case "10.13039/501100000038"=>    queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "nserc_______" )
-          case "10.13039/501100000155"=>    queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "sshrc_______" )
-          case "10.13039/501100000024"=>    queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "cihr________" )
+          case "10.13039/501100000038"=>    val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63")
+                                            queue += generateRelation(sourceId, targetId, "isProducedBy" )
+                                            queue += generateRelation(targetId, sourceId, "produces" )
+          case "10.13039/501100000155"=>    val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63")
+                                            queue += generateRelation(sourceId,targetId, "isProducedBy" )
+                                            queue += generateRelation(targetId,sourceId, "produces" )
+          case "10.13039/501100000024"=>    val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63")
+                                            queue += generateRelation(sourceId,targetId, "isProducedBy" )
+                                            queue += generateRelation(targetId,sourceId, "produces" )
          case "10.13039/501100002848" =>   generateSimpleRelationFromAward(funder, "conicytf____", a => a)
          case "10.13039/501100003448" =>   generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
          case "10.13039/501100010198" =>   generateSimpleRelationFromAward(funder, "sgov________", a=>a)
          case "10.13039/501100004564" =>   generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
          case "10.13039/501100003407" =>   generateSimpleRelationFromAward(funder, "miur________", a=>a)
-                                            queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "miur________" )
+                                            val targetId = getProjectId("miur________" , "1e5e62235d094afd01cd56e65112fc63")
+                                            queue += generateRelation(sourceId,targetId, "isProducedBy" )
+                                            queue += generateRelation(targetId,sourceId, "produces" )
          case "10.13039/501100006588" |
                "10.13039/501100004488" =>  generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
          case "10.13039/501100006769"=>    generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
          case "10.13039/501100001711"=>    generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
          case "10.13039/501100004410"=>    generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
          case "10.10.13039/100004440"=>    generateSimpleRelationFromAward(funder, "wt__________", a =>a)
-          case "10.13039/100004440"=>       queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" )
+          case "10.13039/100004440"=>       val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63")
+                                            queue += generateRelation(sourceId,targetId, "isProducedBy" )
+                                            queue += generateRelation(targetId,sourceId, "produces" )
+
          case _ =>                         logger.debug("no match for "+funder.DOI.get )


@ -341,7 +360,9 @@ case object Crossref2Oaf {
          case "The French National Research Agency (ANR)" |
               "The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
          case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
-          case "Wellcome Trust Masters Fellowship" => queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" )
+          case "Wellcome Trust Masters Fellowship" =>  val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
+                                                        queue +=  generateRelation(sourceId, targetId, "isProducedBy" )
+                                                        queue +=  generateRelation(targetId, sourceId, "produces" )
          case _ =>                         logger.debug("no match for "+funder.name )

        }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala
@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.crossref

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import org.apache.commons.io.IOUtils
+import org.apache.hadoop.io.{IntWritable, Text}
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
@ -12,21 +13,23 @@ import org.slf4j.{Logger, LoggerFactory}

 object CrossrefDataset {

+  val logger: Logger = LoggerFactory.getLogger(CrossrefDataset.getClass) // log under this job's own category

-  def extractTimestamp(input:String): Long = {
+
+  def to_item(input:String):CrossrefDT = {

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)
-
-    (json\"indexed"\"timestamp").extractOrElse[Long](0)
+    val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
+    val doi:String  = (json \ "DOI").extract[String]
+    CrossrefDT(doi, input, ts)

  }

-
  def main(args: Array[String]): Unit = {


-    val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
+
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
    parser.parseArgument(args)
@ -49,9 +52,8 @@ object CrossrefDataset {
        if (a == null)
          return b

-        val tb = extractTimestamp(b.json)
-        val ta = extractTimestamp(a.json)
-        if(ta >tb) {
+
+        if(a.timestamp >b.timestamp) {
          return a
        }
        b
@ -63,9 +65,7 @@ object CrossrefDataset {
        if (a == null)
          return b

-        val tb = extractTimestamp(b.json)
-        val ta = extractTimestamp(a.json)
-        if(ta >tb) {
+        if(a.timestamp >b.timestamp) {
          return a
        }
        b
@ -78,15 +78,21 @@ object CrossrefDataset {
      override def finish(reduction: CrossrefDT): CrossrefDT = reduction
    }

-    val sourcePath:String = parser.get("sourcePath")
-    val targetPath:String = parser.get("targetPath")
+    val workingPath:String = parser.get("workingPath")

-    val ds:Dataset[CrossrefDT] = spark.read.load(sourcePath).as[CrossrefDT]

-    ds.groupByKey(_.doi)
+    val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
+
+
+    val update =
+      spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update",  classOf[IntWritable], classOf[Text])
+        .map(i =>CrossrefImporter.decompressBlob(i._2.toString))
+        .map(i =>to_item(i)))
+
+    main_ds.union(update).groupByKey(_.doi)
      .agg(crossrefAggregator.toColumn)
      .map(s=>s._2)
-      .write.mode(SaveMode.Overwrite).save(targetPath)
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")

  }

--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java
@ -2,18 +2,16 @@
 package eu.dnetlib.doiboost.crossref;

 import java.io.ByteArrayOutputStream;
+import java.util.Optional;
 import java.util.zip.Inflater;

 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;

@ -30,34 +28,45 @@ public class CrossrefImporter {

 		parser.parseArgument(args);

-		final String hdfsuri = parser.get("namenode");
-		System.out.println("HDFS URI" + hdfsuri);
-		Path hdfswritepath = new Path(parser.get("targetPath"));
-		System.out.println("TargetPath: " + hdfsuri);
+		final String namenode = parser.get("namenode");
+		System.out.println("namenode: " + namenode);

-		final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
-			? Long.parseLong(parser.get("timestamp"))
-			: -1;
+		Path targetPath = new Path(parser.get("targetPath"));
+		System.out.println("targetPath: " + targetPath);

-		if (timestamp > 0)
-			System.out.println("Timestamp added " + timestamp);
+		final Long timestamp = Optional
+			.ofNullable(parser.get("timestamp"))
+			.map(s -> {
+				try {
+					return Long.parseLong(s);
+				} catch (NumberFormatException e) {
+					return -1L;
+				}
+			})
+			.orElse(-1L);
+		System.out.println("timestamp: " + timestamp);
+
+		final String esServer = parser.get("esServer");
+		System.out.println("esServer: " + esServer);
+
+		final String esIndex = parser.get("esIndex");
+		System.out.println("esIndex: " + esIndex);

 		// ====== Init HDFS File System Object
 		Configuration conf = new Configuration();
 		// Set FileSystem URI
-		conf.set("fs.defaultFS", hdfsuri);
+		conf.set("fs.defaultFS", namenode);
 		// Because of Maven
 		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
 		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

-		ESClient client = timestamp > 0
-			? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
-			: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
+		// "ip-90-147-167-25.ct1.garrservices.it", "crossref"
+		final ESClient client = new ESClient(esServer, esIndex, timestamp);

 		try (SequenceFile.Writer writer = SequenceFile
 			.createWriter(
 				conf,
-				SequenceFile.Writer.file(hdfswritepath),
+				SequenceFile.Writer.file(targetPath),
 				SequenceFile.Writer.keyClass(IntWritable.class),
 				SequenceFile.Writer.valueClass(Text.class))) {

@ -74,8 +83,7 @@ public class CrossrefImporter {
 					end = System.currentTimeMillis();
 					final float time = (end - start) / 1000.0F;
 					System.out
-						.println(
-							String.format("Imported %d records last 100000 imported in %f seconds", i, time));
+						.println(String.format("Imported %s records last 100000 imported in %s seconds", i, time));
 					start = System.currentTimeMillis();
 				}
 			}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java
@ -1,11 +1,11 @@

 package eu.dnetlib.doiboost.crossref;

-import java.io.IOException;
 import java.util.Iterator;
 import java.util.List;

 import org.apache.commons.io.IOUtils;
+import org.apache.http.HttpHeaders;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpPost;
 import org.apache.http.entity.StringEntity;
@ -17,13 +17,17 @@ import org.slf4j.LoggerFactory;
 import com.jayway.jsonpath.JsonPath;

 public class ESClient implements Iterator<String> {
-	private static final Logger logger = LoggerFactory.getLogger(ESClient.class);

-	static final String blobPath = "$.hits[*].hits[*]._source.blob";
-	static final String scrollIdPath = "$._scroll_id";
-	static final String JSON_NO_TS = "{\"size\":1000}";
-	static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
-	static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
+	private static final String BLOB_PATH = "$.hits.hits[*]._source.blob";
+	private static final String SCROLL_ID_PATH = "$._scroll_id";
+	private static final String JSON_NO_TS = "{\"size\":1000}";
+	private static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
+	private static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
+
+	public static final String APPLICATION_JSON = "application/json";
+
+	public static final String ES_SEARCH_URL = "http://%s:9200/%s/_search?scroll=1m";
+	public static final String ES_SCROLL_URL = "http://%s:9200/_search/scroll";

 	private final String scrollId;

@ -31,47 +35,30 @@ public class ESClient implements Iterator<String> {

 	private final String esHost;

-	public ESClient(final String esHost, final String esIndex) throws IOException {
-
+	public ESClient(final String esHost, final String esIndex, final long timestamp) {
 		this.esHost = esHost;
-		final String body = getResponse(
-			String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
-		scrollId = getJPathString(scrollIdPath, body);
-		buffer = getBlobs(body);
-	}

-	public ESClient(final String esHost, final String esIndex, final long timestamp)
-		throws IOException {
-		this.esHost = esHost;
-		final String body = getResponse(
-			String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
-			String.format(JSON_WITH_TS, timestamp));
-		scrollId = getJPathString(scrollIdPath, body);
+		final String body = timestamp > 0
+			? getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), String.format(JSON_WITH_TS, timestamp))
+			: getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), JSON_NO_TS);
+		scrollId = getJPathString(SCROLL_ID_PATH, body);
 		buffer = getBlobs(body);
 	}

 	private String getResponse(final String url, final String json) {
-		CloseableHttpClient client = HttpClients.createDefault();
-		try {
-
+		try (CloseableHttpClient client = HttpClients.createDefault()) {
 			HttpPost httpPost = new HttpPost(url);
 			if (json != null) {
 				StringEntity entity = new StringEntity(json);
 				httpPost.setEntity(entity);
-				httpPost.setHeader("Accept", "application/json");
-				httpPost.setHeader("Content-type", "application/json");
+				httpPost.setHeader(HttpHeaders.ACCEPT, APPLICATION_JSON);
+				httpPost.setHeader(HttpHeaders.CONTENT_TYPE, APPLICATION_JSON);
+			}
+			try (CloseableHttpResponse response = client.execute(httpPost)) {
+				return IOUtils.toString(response.getEntity().getContent());
 			}
-			CloseableHttpResponse response = client.execute(httpPost);
-
-			return IOUtils.toString(response.getEntity().getContent());
 		} catch (Throwable e) {
 			throw new RuntimeException("Error on executing request ", e);
-		} finally {
-			try {
-				client.close();
-			} catch (IOException e) {
-				throw new RuntimeException("Unable to close client ", e);
-			}
 		}
 	}

@ -87,7 +74,7 @@ public class ESClient implements Iterator<String> {
 	}

 	private List<String> getBlobs(final String body) {
-		final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
+		final List<String> res = JsonPath.read(body, BLOB_PATH);
 		return res;
 	}

@ -102,11 +89,11 @@ public class ESClient implements Iterator<String> {
 		if (buffer.isEmpty()) {

 			final String json_param = String.format(JSON_SCROLL, scrollId);
-			final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
+			final String body = getResponse(String.format(ES_SCROLL_URL, esHost), json_param);
 			try {
 				buffer = getBlobs(body);
 			} catch (Throwable e) {
-				logger.error("Error on  get next page: body:" + body);
+				System.out.println("Error on  get next page: body:" + body);
 			}
 		}
 		return nextItem;
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
@ -29,69 +29,26 @@ object SparkMapDumpIntoOAF {
        .appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

+    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]

-    val sc = spark.sparkContext
    val targetPath = parser.get("targetPath")
+    import spark.implicits._

+    spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
+      .flatMap(k => Crossref2Oaf.convert(k.json))
+      .filter(o => o != null)
+      .write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")

-    sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text])
-      .map(k => k._2.toString).map(CrossrefImporter.decompressBlob)
-      .flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject")
+    val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]

-    val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p=> p!= null)
+    ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")

-    val distinctPubs:RDD[Publication] = inputRDD.filter(k => k != null && k.isInstanceOf[Publication])
-      .map(k => k.asInstanceOf[Publication]).map { p: Publication => Tuple2(p.getId, p) }.reduceByKey { case (p1: Publication, p2: Publication) =>
-      var r = if (p1 == null) p2 else p1
-      if (p1 != null && p2 != null) {
-        if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
-          if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
-            r = p2
-          else
-            r = p1
-        } else {
-          r = if (p1.getLastupdatetimestamp == null) p2 else p1
-        }
-      }
-      r
-    }.map(_._2)
+    ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")

-    val pubs:Dataset[Publication] = spark.createDataset(distinctPubs)
-    pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication")
-
-
-    val distincDatasets:RDD[OafDataset] = inputRDD.filter(k => k != null && k.isInstanceOf[OafDataset])
-      .map(k => k.asInstanceOf[OafDataset]).map(p => Tuple2(p.getId, p)).reduceByKey { case (p1: OafDataset, p2: OafDataset) =>
-      var r = if (p1 == null) p2 else p1
-      if (p1 != null && p2 != null) {
-        if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
-          if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
-            r = p2
-          else
-            r = p1
-        } else {
-          r = if (p1.getLastupdatetimestamp == null) p2 else p1
-        }
-      }
-      r
-    }.map(_._2)
-
-    spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset")
-
-
-
-    val distinctRels =inputRDD.filter(k => k != null && k.isInstanceOf[Relation])
-      .map(k => k.asInstanceOf[Relation]).map(r=> (s"${r.getSource}::${r.getTarget}",r))
-      .reduceByKey { case (p1: Relation, p2: Relation) =>
-        if (p1 == null) p2 else p1
-      }.map(_._2)
-
-    val rels: Dataset[Relation] = spark.createDataset(distinctRels)
-
-    rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations")
+    ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
  }


--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
@ -21,15 +21,17 @@ object SparkImportMagIntoDataset {


  val stream = Map(
-    "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
-    "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
-    "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
+    "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
+    "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
+    "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
+    "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
    "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
    "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
    "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
-    "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
-    "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
+    //                                                         ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime']
+    "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
+    "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
    "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
    "PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
    "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
@ -39,7 +41,7 @@ object SparkImportMagIntoDataset {
    "PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
    "PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
    "PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
-    "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
+    "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
    "RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
  )

--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala
@ -11,7 +11,7 @@ import org.slf4j.{Logger, LoggerFactory}

 import scala.collection.JavaConverters._

-object SparkPreProcessMAG {
+object SparkProcessMAG {
  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
@ -26,12 +26,15 @@ object SparkPreProcessMAG {
        .master(parser.get("master")).getOrCreate()

    val sourcePath = parser.get("sourcePath")
+    val workingPath = parser.get("workingPath")
+    val targetPath = parser.get("targetPath")
+
    import spark.implicits._
    implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)

    logger.info("Phase 1) make uninque DOI in Papers:")
-    val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
+    val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]

    // Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
    val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
@ -41,11 +44,12 @@ object SparkPreProcessMAG {
      .map(_._2)

    val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
-    distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
+
+    distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")

    logger.info("Phase 0) Enrich Publication with description")
-    val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
-    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
+    val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
+    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")

    logger.info("Phase 3) Group Author by PaperId")
    val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
@ -64,24 +68,24 @@ object SparkPreProcessMAG {
        } else
          mpa
      }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
-      .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")

    logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")

    val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]

-    val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
+    val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]

-    val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
+    val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]

    val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
    firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
      .map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
-      .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")


    var magPubs: Dataset[(String, Publication)] =
-      spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
+      spark.read.load(s"$workingPath/merge_step_2").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]


@ -95,10 +99,10 @@ object SparkPreProcessMAG {
      .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
      .write
      .mode(SaveMode.Overwrite)
-      .save(s"${parser.get("targetPath")}/merge_step_2_conference")
+      .save(s"$workingPath/merge_step_2_conference")


-    magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
+    magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]

    val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
@ -108,27 +112,27 @@ object SparkPreProcessMAG {
    magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
      .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
      .write.mode(SaveMode.Overwrite)
-      .save(s"${parser.get("targetPath")}/merge_step_3")
+      .save(s"$workingPath/merge_step_3")


 //    logger.info("Phase 6) Enrich Publication with description")
 //    val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
 //    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")

-    val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
+    val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]


-    magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
+    magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]

    magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithDescription(item)
-    ).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
+    ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")


    logger.info("Phase 7) Enrich Publication with FieldOfStudy")

-    magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
+    magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]

    val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
@ -144,14 +148,14 @@ object SparkPreProcessMAG {
      .equalTo(paperField("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithSubject(item))
      .write.mode(SaveMode.Overwrite)
-      .save(s"${parser.get("targetPath")}/mag_publication")
+      .save(s"$workingPath/mag_publication")


-    val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
+    val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
      .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
    .map(_._2)

-    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
+    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")

  }
 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
@ -17,11 +17,12 @@ import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.mortbay.log.Log;

-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;

 public class ActivitiesDecompressor {

@ -143,4 +144,64 @@ public class ActivitiesDecompressor {
 		Log.info("Error from Orcid found: " + errorFromOrcidFound);
 		Log.info("Error parsing xml work found: " + xmlParserErrorFound);
 	}
+
+	/**
+	 * Copies every "works" entry of a compressed tar archive into a block-compressed (gzip)
+	 * SequenceFile of Text/Text pairs.
+	 *
+	 * <p>The key of each record is the value returned by
+	 * {@code XMLRecordParser.retrieveOrcidIdFromActivity} for the entry, the value is the XML
+	 * content of the entry with its lines concatenated (line terminators are dropped).
+	 * Entries that are directories or whose name does not contain "works" are skipped.
+	 *
+	 * @param conf Hadoop configuration used to resolve the file system and the compression codec
+	 * @param inputUri URI of the compressed tar archive; the codec is chosen from its file name
+	 * @param outputPath path of the SequenceFile to create
+	 * @throws Exception on any I/O or parsing failure; the JVM is terminated with exit code 1
+	 *         when no codec matches {@code inputUri}
+	 */
+	public static void extractXML(Configuration conf, String inputUri, Path outputPath)
+		throws Exception {
+		FileSystem fs = FileSystem.get(URI.create(inputUri), conf);
+		Path inputPath = new Path(inputUri);
+		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+		CompressionCodec codec = factory.getCodec(inputPath);
+		if (codec == null) {
+			System.err.println("No codec found for " + inputUri);
+			System.exit(1);
+		}
+		InputStream gzipInputStream = null;
+		try {
+			gzipInputStream = codec.createInputStream(fs.open(inputPath));
+			int counter = 0;
+			try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+				TarArchiveEntry entry = null;
+				try (SequenceFile.Writer writer = SequenceFile
+					.createWriter(
+						conf,
+						SequenceFile.Writer.file(outputPath),
+						SequenceFile.Writer.keyClass(Text.class),
+						SequenceFile.Writer.valueClass(Text.class),
+						SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
+					while ((entry = tais.getNextTarEntry()) != null) {
+						String filename = entry.getName();
+						if (entry.isDirectory() || !filename.contains("works")) {
+							continue;
+						}
+						counter++;
+						// The reader is deliberately left open: closing it would also close the
+						// tar stream that the remaining entries are read from.
+						// Decode explicitly as UTF-8 instead of relying on the platform charset.
+						BufferedReader br = new BufferedReader(
+							new InputStreamReader(tais, java.nio.charset.StandardCharsets.UTF_8));
+						String line;
+						StringBuilder buffer = new StringBuilder();
+						while ((line = br.readLine()) != null) {
+							buffer.append(line);
+						}
+						String xml = buffer.toString();
+						String[] filenameParts = filename.split("/");
+						// Re-encode with the same charset used for decoding, so the parser
+						// receives the bytes that were read from the archive.
+						final Text key = new Text(
+							XMLRecordParser
+								.retrieveOrcidIdFromActivity(
+									xml.getBytes(java.nio.charset.StandardCharsets.UTF_8),
+									filenameParts[filenameParts.length - 1]));
+						final Text value = new Text(xml);
+						writer.append(key, value);
+						if ((counter % 100000) == 0) {
+							Log.info("Current xml works extracted: " + counter);
+						}
+					}
+				}
+			}
+			Log.info("Activities extraction completed");
+			Log.info("Total XML works parsed: " + counter);
+		} finally {
+			Log.debug("Closing gzip stream");
+			IOUtils.closeStream(gzipInputStream);
+		}
+	}
+
+
 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLActivitiesData.java
@ -0,0 +1,54 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.mortbay.log.Log;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
+
+/**
+ * Command-line job that hands an ORCID activities tar.gz archive to
+ * {@code ActivitiesDecompressor.extractXML}, writing the result under the working path.
+ */
+public class ExtractXMLActivitiesData extends OrcidDSManager {
+	private String outputWorksPath;
+	private String activitiesFileNameTarGz;
+
+	/**
+	 * Entry point: parses the command-line arguments, then runs the extraction.
+	 *
+	 * @param args command-line arguments, as described by the JSON descriptor read in loadArgs
+	 */
+	public static void main(String[] args) throws IOException, Exception {
+		final ExtractXMLActivitiesData job = new ExtractXMLActivitiesData();
+		job.loadArgs(args);
+		job.extractWorks();
+	}
+
+	/**
+	 * Reads the argument descriptor from the classpath, parses {@code args} against it and
+	 * stores (and logs) the HDFS URI, working path, archive name and output path.
+	 */
+	private void loadArgs(String[] args) throws IOException, Exception {
+		final String argsDescriptor = IOUtils
+			.toString(
+				GenOrcidAuthorWork.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json"));
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(argsDescriptor);
+		parser.parseArgument(args);
+
+		this.hdfsServerUri = parser.get("hdfsServerUri");
+		Log.info("HDFS URI: " + this.hdfsServerUri);
+
+		this.workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + this.workingPath);
+
+		this.activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
+		Log.info("Activities File Name: " + this.activitiesFileNameTarGz);
+
+		this.outputWorksPath = parser.get("outputWorksPath");
+		Log.info("Output Author Work Data: " + this.outputWorksPath);
+	}
+
+	/**
+	 * Builds the input archive URI and the output path (both rooted at
+	 * hdfsServerUri + workingPath) and delegates to ActivitiesDecompressor.extractXML.
+	 */
+	private void extractWorks() throws Exception {
+		final Configuration conf = initConfigurationObject();
+		// The returned FileSystem is not used here; the call is kept so that a
+		// file system problem still surfaces at this point.
+		initFileSystemObject(conf);
+		final String workingDirUri = hdfsServerUri.concat(workingPath);
+		final String tarGzUri = workingDirUri.concat(activitiesFileNameTarGz);
+		final Path outputPath = new Path(workingDirUri.concat(outputWorksPath));
+		ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath);
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ExtractXMLSummariesData.java
@ -0,0 +1,56 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.mortbay.log.Log;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
+
+/**
+ * Command-line job that hands an ORCID summaries tar.gz archive to
+ * {@code SummariesDecompressor.extractXML}, writing {@code xml_authors.seq} under the
+ * configured output path.
+ */
+public class ExtractXMLSummariesData extends OrcidDSManager {
+
+	private String outputAuthorsPath;
+	private String summariesFileNameTarGz;
+
+	/**
+	 * Entry point: parses the command-line arguments, then runs the extraction.
+	 *
+	 * @param args command-line arguments, as described by the JSON descriptor read in loadArgs
+	 */
+	public static void main(String[] args) throws IOException, Exception {
+		final ExtractXMLSummariesData job = new ExtractXMLSummariesData();
+		job.loadArgs(args);
+		job.extractAuthors();
+	}
+
+	/**
+	 * Reads the argument descriptor from the classpath, parses {@code args} against it and
+	 * stores (and logs) the HDFS URI, working path, archive name and output path.
+	 */
+	private void loadArgs(String[] args) throws IOException, Exception {
+		final String argsDescriptor = IOUtils
+			.toString(
+				GenOrcidAuthorWork.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json"));
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(argsDescriptor);
+		parser.parseArgument(args);
+
+		this.hdfsServerUri = parser.get("hdfsServerUri");
+		Log.info("HDFS URI: " + this.hdfsServerUri);
+
+		this.workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + this.workingPath);
+
+		this.summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
+		Log.info("Summaries File Name: " + this.summariesFileNameTarGz);
+
+		this.outputAuthorsPath = parser.get("outputAuthorsPath");
+		Log.info("Output Authors Data: " + this.outputAuthorsPath);
+	}
+
+	/**
+	 * Builds the input archive URI and the output sequence-file path (both rooted at
+	 * hdfsServerUri + workingPath) and delegates to SummariesDecompressor.extractXML.
+	 */
+	public void extractAuthors() throws Exception {
+		final Configuration conf = initConfigurationObject();
+		// The returned FileSystem is not used here; the call is kept so that a
+		// file system problem still surfaces at this point.
+		initFileSystemObject(conf);
+		final String workingDirUri = hdfsServerUri.concat(workingPath);
+		final String tarGzUri = workingDirUri.concat(summariesFileNameTarGz);
+		final String outputFileUri = workingDirUri
+			.concat(outputAuthorsPath)
+			.concat("xml_authors.seq");
+		final Path outputPath = new Path(outputFileUri);
+		SummariesDecompressor.extractXML(conf, tarGzUri, outputPath);
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
@ -1,10 +1,11 @@
 package eu.dnetlib.doiboost.orcid

-import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
+import com.fasterxml.jackson.databind.ObjectMapper
+import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
+import eu.dnetlib.dhp.schema.orcid.OrcidDOI
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
 import org.apache.commons.lang.StringUtils
-import org.codehaus.jackson.map.ObjectMapper
 import org.slf4j.{Logger, LoggerFactory}

 import scala.collection.JavaConverters._
@ -17,7 +18,7 @@ case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,err
 case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
 object ORCIDToOAF {
  val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
-  val mapper = new ObjectMapper
+  val mapper = new ObjectMapper()

  def isJsonValid(inputStr: String): Boolean = {
    import java.io.IOException
@ -43,16 +44,19 @@ object ORCIDToOAF {
  }


-  def convertTOOAF(input:ORCIDElement) :Publication = {
-    val doi = input.doi
+  def convertTOOAF(input:OrcidDOI) :Publication = {
+    val doi = input.getDoi
    val pub:Publication = new Publication
-    pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
+    pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
    pub.setDataInfo(generateDataInfo())
    pub.setId(generateIdentifier(pub, doi.toLowerCase))
    try{
-      pub.setAuthor(input.authors.map(a=> {
-        generateAuthor(a.name, a.surname, a.creditName, a.oid)
-      }).asJava)
+
+      val l:List[Author]= input.getAuthors.asScala.map(a=> {
+              generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
+            })(collection.breakOut)
+
+      pub.setAuthor(l.asJava)
      pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
      pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
      pub
@ -63,6 +67,13 @@ object ORCIDToOAF {
    }
  }

+  /**
+   * Builds the DataInfo that generateAuthor attaches to an author's ORCID pid.
+   *
+   * Starts from DoiBoostMappingUtil.generateDataInfo("0.91") and overrides the provenance
+   * action with classid "sysimport:crosswalk:entityregistry" and classname "Harvested".
+   * NOTE(review): "0.91" is presumably the trust value -- confirm against DoiBoostMappingUtil.
+   *
+   * @return the customised DataInfo instance
+   */
+  def generateOricPIDDatainfo():DataInfo = {
+    val di =DoiBoostMappingUtil.generateDataInfo("0.91")
+    di.getProvenanceaction.setClassid("sysimport:crosswalk:entityregistry")
+    di.getProvenanceaction.setClassname("Harvested")
+    di
+  }
+
  def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
    val a = new Author
    a.setName(given)
@ -72,7 +83,7 @@ object ORCIDToOAF {
    else
      a.setFullname(s"$given $family")
    if (StringUtils.isNotBlank(orcid))
-      a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
+      a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateOricPIDDatainfo())).asJava)

    a
  }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
 	public void generateAuthorsDOIsData() throws Exception {
 		Configuration conf = initConfigurationObject();
 		FileSystem fs = initFileSystemObject(conf);
-		String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
-		Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
+		Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
 		ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
 	}

@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {

 		hdfsServerUri = parser.get("hdfsServerUri");
 		Log.info("HDFS URI: " + hdfsServerUri);
-		hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-		Log.info("Default Path: " + hdfsOrcidDefaultPath);
+		workingPath = parser.get("workingPath");
+		Log.info("Default Path: " + workingPath);
 		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
 		Log.info("Activities File Name: " + activitiesFileNameTarGz);
 		outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 public class OrcidDSManager {

 	protected String hdfsServerUri;
-	protected String hdfsOrcidDefaultPath;
+	protected String workingPath;
 	private String summariesFileNameTarGz;
 	private String outputAuthorsPath;

@ -28,10 +28,10 @@ public class OrcidDSManager {
 	public void generateAuthors() throws Exception {
 		Configuration conf = initConfigurationObject();
 		FileSystem fs = initFileSystemObject(conf);
-		String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
 		Path outputPath = new Path(
 			hdfsServerUri
-				.concat(hdfsOrcidDefaultPath)
+				.concat(workingPath)
 				.concat(outputAuthorsPath)
 				.concat("authors.seq"));
 		SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
@ -41,22 +41,18 @@ public class OrcidDSManager {
 		// ====== Init HDFS File System Object
 		Configuration conf = new Configuration();
 		// Set FileSystem URI
-		conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
+		conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
 		// Because of Maven
 		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
 		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
 		return conf;
 	}

-	protected FileSystem initFileSystemObject(Configuration conf) {
+	protected FileSystem initFileSystemObject(Configuration conf) throws IOException {
 		// Get the filesystem - HDFS
+		// if there is an exception, it will be propagated
 		FileSystem fs = null;
-		try {
-			fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
-		} catch (IOException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		}
+		fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
 		return fs;
 	}

@ -66,13 +62,13 @@ public class OrcidDSManager {
 				.toString(
 					OrcidDSManager.class
 						.getResourceAsStream(
-							"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
+							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
 		parser.parseArgument(args);

 		hdfsServerUri = parser.get("hdfsServerUri");
 		Log.info("HDFS URI: " + hdfsServerUri);
-		hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-		Log.info("Default Path: " + hdfsOrcidDefaultPath);
+		workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + workingPath);
 		summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
 		Log.info("Summaries File Name: " + summariesFileNameTarGz);
 		outputAuthorsPath = parser.get("outputAuthorsPath");
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java
@ -1,14 +1,15 @@

 package eu.dnetlib.doiboost.orcid;

-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.*;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.List;

+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@ -16,6 +17,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
@ -27,10 +29,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 public class OrcidDownloader extends OrcidDSManager {

 	static final int REQ_LIMIT = 24;
-//	static final int REQ_MAX_TEST = 100;
-	static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10000;
+	static final int REQ_MAX_TEST = -1;
+	static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500;
 	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
-	static final String lastUpdate = "2019-09-30 00:00:00";
+	static final String lastUpdate = "2020-09-29 00:00:00";
 	private String lambdaFileName;
 	private String outputPath;
 	private String token;
@ -41,7 +43,7 @@ public class OrcidDownloader extends OrcidDSManager {
 		orcidDownloader.parseLambdaFile();
 	}

-	private String downloadRecord(String orcidId) {
+	private String downloadRecord(String orcidId) throws IOException {
 		try (CloseableHttpClient client = HttpClients.createDefault()) {
 			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
 			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
@ -49,17 +51,23 @@ public class OrcidDownloader extends OrcidDSManager {
 			CloseableHttpResponse response = client.execute(httpGet);
 			if (response.getStatusLine().getStatusCode() != 200) {
 				Log
-					.warn(
+					.info(
 						"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
 				return new String("");
 			}
-			return IOUtils.toString(response.getEntity().getContent());
-
-		} catch (Throwable e) {
-			Log.warn("Downloading " + orcidId, e.getMessage());
-
+//			return IOUtils.toString(response.getEntity().getContent());
+			return xmlStreamToString(response.getEntity().getContent());
 		}
-		return new String("");
+	}
+
+	/**
+	 * Reads the whole stream as UTF-8 text and returns it as a single string.
+	 *
+	 * Lines are concatenated without any separator (line terminators are dropped), exactly as
+	 * before. The stream is closed when reading completes or fails.
+	 *
+	 * @param xmlStream the stream to consume; closed by this method
+	 * @return the stream content with line terminators removed
+	 * @throws IOException if reading from the stream fails
+	 */
+	private String xmlStreamToString(InputStream xmlStream) throws IOException {
+		// Decode explicitly as UTF-8 instead of relying on the platform default charset,
+		// and close the reader (and the underlying stream) deterministically.
+		try (BufferedReader br = new BufferedReader(
+			new InputStreamReader(xmlStream, java.nio.charset.StandardCharsets.UTF_8))) {
+			String line;
+			// Local, single-threaded accumulation: no need for the synchronized StringBuffer.
+			StringBuilder buffer = new StringBuilder();
+			while ((line = br.readLine()) != null) {
+				buffer.append(line);
+			}
+			return buffer.toString();
+		}
+	}

 	public void parseLambdaFile() throws Exception {
@ -69,97 +77,94 @@ public class OrcidDownloader extends OrcidDSManager {
 		long startDownload = 0;
 		Configuration conf = initConfigurationObject();
 		FileSystem fs = initFileSystemObject(conf);
-		String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
+		String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
 		Path hdfsreadpath = new Path(lambdaFileUri);
 		FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
 		Path hdfsoutputPath = new Path(
 			hdfsServerUri
-				.concat(hdfsOrcidDefaultPath)
+				.concat(workingPath)
 				.concat(outputPath)
-				.concat("orcid_records.seq"));
-
-		try (SequenceFile.Writer writer = SequenceFile
-			.createWriter(
-				conf,
-				SequenceFile.Writer.file(hdfsoutputPath),
-				SequenceFile.Writer.keyClass(Text.class),
-				SequenceFile.Writer.valueClass(Text.class))) {
-
-			try (BufferedReader br = new BufferedReader(new InputStreamReader(lambdaFileStream))) {
-				String line;
-				int nReqTmp = 0;
+				.concat("updated_xml_authors.seq"));
+		try (TarArchiveInputStream tais = new TarArchiveInputStream(
+			new GzipCompressorInputStream(lambdaFileStream))) {
+			TarArchiveEntry entry = null;
+			StringBuilder sb = new StringBuilder();
+			try (SequenceFile.Writer writer = SequenceFile
+				.createWriter(
+					conf,
+					SequenceFile.Writer.file(hdfsoutputPath),
+					SequenceFile.Writer.keyClass(Text.class),
+					SequenceFile.Writer.valueClass(Text.class),
+					SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
 				startDownload = System.currentTimeMillis();
-				long startReqTmp = System.currentTimeMillis();
-				while ((line = br.readLine()) != null) {
-					parsedRecordsCounter++;
-					// skip headers line
-					if (parsedRecordsCounter == 1) {
-						continue;
-					}
-					String[] values = line.split(",");
-					List<String> recordInfo = Arrays.asList(values);
-					String orcidId = recordInfo.get(0);
-					if (isModified(orcidId, recordInfo.get(3))) {
-						String record = downloadRecord(orcidId);
-						downloadedRecordsCounter++;
-						if (!record.isEmpty()) {
-							String compressRecord = ArgumentApplicationParser.compressArgument(record);
-							final Text key = new Text(recordInfo.get(0));
-							final Text value = new Text(compressRecord);
-
-							try {
+				while ((entry = tais.getNextTarEntry()) != null) {
+					BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
+					String line;
+					while ((line = br.readLine()) != null) {
+						String[] values = line.split(",");
+						List<String> recordInfo = Arrays.asList(values);
+						int nReqTmp = 0;
+						long startReqTmp = System.currentTimeMillis();
+						// skip headers line
+						if (parsedRecordsCounter == 0) {
+							parsedRecordsCounter++;
+							continue;
+						}
+						parsedRecordsCounter++;
+						String orcidId = recordInfo.get(0);
+						if (isModified(orcidId, recordInfo.get(3))) {
+							String record = downloadRecord(orcidId);
+							downloadedRecordsCounter++;
+							if (!record.isEmpty()) {
+//							String compressRecord = ArgumentApplicationParser.compressArgument(record);
+								final Text key = new Text(recordInfo.get(0));
+								final Text value = new Text(record);
 								writer.append(key, value);
 								savedRecordsCounter++;
-							} catch (IOException e) {
-								Log.warn("Writing to sequence file: " + e.getMessage());
-								Log.warn(e);
-								throw new RuntimeException(e);
+							}
+						} else {
+							break;
+						}
+						long endReq = System.currentTimeMillis();
+						nReqTmp++;
+						if (nReqTmp == REQ_LIMIT) {
+							long reqSessionDuration = endReq - startReqTmp;
+							if (reqSessionDuration <= 1000) {
+								Log
+									.info(
+										"\nreqSessionDuration: "
+											+ reqSessionDuration
+											+ " nReqTmp: "
+											+ nReqTmp
+											+ " wait ....");
+								Thread.sleep(1000 - reqSessionDuration);
+							} else {
+								nReqTmp = 0;
+								startReqTmp = System.currentTimeMillis();
+							}
+						}
+						if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
+							Log
+								.info(
+									"Current parsed: "
+										+ parsedRecordsCounter
+										+ " downloaded: "
+										+ downloadedRecordsCounter
+										+ " saved: "
+										+ savedRecordsCounter);
+							if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) {
+								break;
 							}
 						}
 					}
-					long endReq = System.currentTimeMillis();
-					nReqTmp++;
-					if (nReqTmp == REQ_LIMIT) {
-						long reqSessionDuration = endReq - startReqTmp;
-						if (reqSessionDuration <= 1000) {
-							Log
-								.warn(
-									"\nreqSessionDuration: "
-										+ reqSessionDuration
-										+ " nReqTmp: "
-										+ nReqTmp
-										+ " wait ....");
-							Thread.sleep(1000 - reqSessionDuration);
-						} else {
-							nReqTmp = 0;
-							startReqTmp = System.currentTimeMillis();
-						}
-					}
-
-//					if (parsedRecordsCounter > REQ_MAX_TEST) {
-//						break;
-//					}
-					if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
-						Log
-							.info(
-								"Current parsed: "
-									+ parsedRecordsCounter
-									+ " downloaded: "
-									+ downloadedRecordsCounter
-									+ " saved: "
-									+ savedRecordsCounter);
-//						if (parsedRecordsCounter > REQ_MAX_TEST) {
-//							break;
-//						}
-					}
+					long endDownload = System.currentTimeMillis();
+					long downloadTime = endDownload - startDownload;
+					Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
 				}
-				long endDownload = System.currentTimeMillis();
-				long downloadTime = endDownload - startDownload;
-				Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
 			}
 		}
-		lambdaFileStream.close();
 		Log.info("Download started at: " + new Date(startDownload).toString());
+		Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString());
 		Log.info("Parsed Records Counter: " + parsedRecordsCounter);
 		Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
 		Log.info("Saved Records Counter: " + savedRecordsCounter);
@ -176,8 +181,8 @@ public class OrcidDownloader extends OrcidDSManager {

 		hdfsServerUri = parser.get("hdfsServerUri");
 		Log.info("HDFS URI: " + hdfsServerUri);
-		hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-		Log.info("Default Path: " + hdfsOrcidDefaultPath);
+		workingPath = parser.get("workingPath");
+		Log.info("Default Path: " + workingPath);
 		lambdaFileName = parser.get("lambdaFileName");
 		Log.info("Lambda File Name: " + lambdaFileName);
 		outputPath = parser.get("outputPath");
@ -185,7 +190,7 @@ public class OrcidDownloader extends OrcidDSManager {
 		token = parser.get("token");
 	}

-	private boolean isModified(String orcidId, String modifiedDate) {
+	public boolean isModified(String orcidId, String modifiedDate) {
 		Date modifiedDateDt = null;
 		Date lastUpdateDt = null;
 		try {
@ -195,7 +200,7 @@ public class OrcidDownloader extends OrcidDSManager {
 			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
 			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
 		} catch (Exception e) {
-			Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
+			Log.info("[" + orcidId + "] Parsing date: ", e.getMessage());
 			return true;
 		}
 		return modifiedDateDt.after(lastUpdateDt);
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
@ -1,21 +1,72 @@
 package eu.dnetlib.doiboost.orcid

+import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetlib.dhp.oa.merge.AuthorMerger
 import eu.dnetlib.dhp.schema.oaf.Publication
+import eu.dnetlib.dhp.schema.orcid.OrcidDOI
 import eu.dnetlib.doiboost.mag.ConversionUtil
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}

 object SparkConvertORCIDToOAF {
+  val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
+
+  /**
+   * Returns a typed Aggregator that folds all (id, Publication) pairs of a group into a
+   * single Publication. Both the buffer and the output are kryo-encoded Publications.
+   */
+  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
+
+    // Empty buffer: a Publication with nothing set (its id is null until the first reduce).
+    override def zero: Publication = new Publication()
+
+    // Folds one incoming publication (a._2) into the buffer b: merges its fields,
+    // recomputes the author list via AuthorMerger, and adopts its id if b has none yet.
+    override def reduce(b: Publication, a: (String, Publication)): Publication = {
+      b.mergeFrom(a._2)
+      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
+      if (b.getId == null)
+        b.setId(a._2.getId)
+      b
+    }


+    // Combines two partial buffers, keeping wx as the result.
+    // NOTE(review): wy.getId.nonEmpty looks like it would throw if wy.getId is null
+    // (e.g. two untouched zero buffers) -- confirm this cannot happen.
+    override def merge(wx: Publication, wy: Publication): Publication = {
+      wx.mergeFrom(wy)
+      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
+      if(wx.getId == null && wy.getId.nonEmpty)
+        wx.setId(wy.getId)
+      wx
+    }
+    // The buffer already is the final result.
+    override def finish(reduction: Publication): Publication = reduction
+
+    override def bufferEncoder: Encoder[Publication] =
+      Encoders.kryo(classOf[Publication])
+
+    override def outputEncoder: Encoder[Publication] =
+      Encoders.kryo(classOf[Publication])
+  }
+
+/**
+ * Converts ORCID records into OAF publications and writes them to targetPath.
+ *
+ * Each line of the text file(s) at sourcePath is parsed with Jackson into an OrcidDOI,
+ * converted with ORCIDToOAF.convertTOOAF (null results are dropped), grouped by
+ * publication id, merged with getPublicationAggregator, and saved in Overwrite mode.
+ *
+ * @param spark      the active Spark session
+ * @param sourcePath location of the input text, one JSON document per line
+ * @param targetPath output location; existing content is overwritten
+ */
+def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
+  implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
+  implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
+  implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
+
+  val mapper = new ObjectMapper()
+  // NOTE(review): the value returned by withFeatures is discarded, so this line appears
+  // to leave the mapper's configuration unchanged -- confirm the intended setting
+  // (possibly mapper.configure(FAIL_ON_UNKNOWN_PROPERTIES, ...)).
+  mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
+
+  val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
+
+  logger.info("Converting ORCID to OAF")
+  // Key every publication by its id so that duplicates collapse into one merged record.
+  dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
+    .map(d => (d.getId, d))
+    .groupByKey(_._1)(Encoders.STRING)
+    .agg(getPublicationAggregator().toColumn)
+    .map(p => p._2)
+    .write.mode(SaveMode.Overwrite).save(targetPath)
+}

  def main(args: Array[String]): Unit = {

-    val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
+
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
    parser.parseArgument(args)
@ -26,19 +77,12 @@ object SparkConvertORCIDToOAF {
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

-    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
-    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
-    import spark.implicits._
+
+
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
-    val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
+    run(spark, sourcePath, targetPath)

-
-    logger.info("Converting ORCID to OAF")
-    val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
-      .map(_._2)
-
-    spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
  }

 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
@ -0,0 +1,184 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.util.LongAccumulator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
+import scala.Tuple2;
+
+/**
+ * Spark job that reads a sequence file of (orcid id, last modified date) pairs, keeps the
+ * records modified after {@link #lastUpdate}, downloads each of them from the ORCID API and
+ * stores the outcome (as produced by {@code DownloadedRecordData.toTuple2()}) in a
+ * compressed sequence file under workingPath + outputPath.
+ */
+public class SparkDownloadOrcidAuthors {
+
+	static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
+	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
+	static final String lastUpdate = "2020-09-29 00:00:00";
+
+	/**
+	 * Entry point.
+	 *
+	 * @param args command-line arguments described by
+	 *             /eu/dnetlib/dhp/doiboost/download_orcid_data.json
+	 * @throws Exception if argument parsing or the Spark job fails
+	 */
+	public static void main(String[] args) throws Exception {
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkDownloadOrcidAuthors.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
+		parser.parseArgument(args);
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+		final String workingPath = parser.get("workingPath");
+		logger.info("workingPath: {}", workingPath);
+		final String outputPath = parser.get("outputPath");
+		logger.info("outputPath: {}", outputPath);
+		// The token is a credential: deliberately not logged.
+		final String token = parser.get("token");
+		final String lambdaFileName = parser.get("lambdaFileName");
+		logger.info("lambdaFileName: {}", lambdaFileName);
+
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+				LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
+				LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
+				LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
+				LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
+				LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
+				LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
+				LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
+				LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");
+
+				logger.info("Retrieving data from lamda sequence file");
+				JavaPairRDD<Text, Text> lamdaFileRDD = sc
+					.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
+				logger.info("Data retrieved: " + lamdaFileRDD.count());
+
+				// Keeps only the records whose last-modified date is after lastUpdate.
+				Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> {
+					String orcidId = data._1().toString();
+					String lastModifiedDate = data._2().toString();
+					parsedRecordsAcc.add(1);
+					if (isModified(orcidId, lastModifiedDate)) {
+						modifiedRecordsAcc.add(1);
+						return true;
+					}
+					return false;
+				};
+
+				// Downloads one ORCID record; never throws, failures are stored in the result.
+				Function<Tuple2<Text, Text>, Tuple2<String, String>> downloadRecordFunction = data -> {
+					String orcidId = data._1().toString();
+					String lastModifiedDate = data._2().toString();
+					final DownloadedRecordData downloaded = new DownloadedRecordData();
+					downloaded.setOrcidId(orcidId);
+					downloaded.setLastModifiedDate(lastModifiedDate);
+					try (CloseableHttpClient client = HttpClients.createDefault()) {
+						HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
+						httpGet.addHeader("Accept", "application/vnd.orcid+xml");
+						httpGet.addHeader("Authorization", String.format("Bearer %s", token));
+						long startReq = System.currentTimeMillis();
+						// The response is closed explicitly so its connection is always released.
+						try (CloseableHttpResponse response = client.execute(httpGet)) {
+							long endReq = System.currentTimeMillis();
+							long reqTime = endReq - startReq;
+							// Throttle: make every request take at least one second.
+							if (reqTime < 1000) {
+								Thread.sleep(1000 - reqTime);
+							}
+							int statusCode = response.getStatusLine().getStatusCode();
+							downloaded.setStatusCode(statusCode);
+							if (statusCode != 200) {
+								// Each case ends explicitly: without the breaks a 403 or 409 fell
+								// through to the 503 branch, was counted as 503 too and raised the
+								// rate-limit error.
+								switch (statusCode) {
+									case 403:
+										errorHTTP403Acc.add(1);
+										break;
+									case 409:
+										errorHTTP409Acc.add(1);
+										break;
+									case 503:
+										errorHTTP503Acc.add(1);
+										// Caught below and recorded as the error message of this record.
+										throw new RuntimeException("Orcid request rate limit reached (HTTP 503)");
+									case 525:
+										errorHTTP525Acc.add(1);
+										break;
+									default:
+										errorHTTPGenericAcc.add(1);
+										break;
+								}
+								logger.info("Downloading {} status code: {}", orcidId, statusCode);
+								return downloaded.toTuple2();
+							}
+							downloadedRecordsAcc.add(1);
+							// Decode explicitly as UTF-8 rather than with the platform default charset.
+							downloaded
+								.setCompressedData(
+									ArgumentApplicationParser
+										.compressArgument(
+											IOUtils.toString(response.getEntity().getContent(), "UTF-8")));
+						}
+					} catch (Throwable e) {
+						if (e instanceof InterruptedException) {
+							// Preserve the interruption for the executor running this task.
+							Thread.currentThread().interrupt();
+						}
+						// Placeholders are required: without them SLF4J drops the extra argument.
+						logger.info("Downloading {}: {}", orcidId, e.getMessage());
+						downloaded.setErrorMessage(e.getMessage());
+						return downloaded.toTuple2();
+					}
+					return downloaded.toTuple2();
+				};
+
+				sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
+
+				logger.info("Start execution ...");
+				JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter);
+				logger.info("Authors modified count: " + authorsModifiedRDD.count());
+				logger.info("Start downloading ...");
+				authorsModifiedRDD
+					.repartition(10)
+					.map(downloadRecordFunction)
+					.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
+					.saveAsNewAPIHadoopFile(
+						workingPath.concat(outputPath),
+						Text.class,
+						Text.class,
+						SequenceFileOutputFormat.class,
+						sc.hadoopConfiguration());
+				// NOTE(review): the filter is evaluated by more than one action above, so the
+				// parsed/modified counters may count each record more than once -- confirm.
+				logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
+				logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
+				logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
+				logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
+				logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
+				logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
+				logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
+				logger.info("errorHTTPGenericAcc: " + errorHTTPGenericAcc.value().toString());
+			});
+
+	}
+
+	/**
+	 * Tells whether a record was modified after {@link #lastUpdate}.
+	 *
+	 * Only the first 19 characters of {@code modifiedDate} are parsed, using
+	 * {@link #DATE_FORMAT}. If the date cannot be parsed the record is treated as modified.
+	 *
+	 * @param orcidId      identifier of the record, used for logging only
+	 * @param modifiedDate last-modified timestamp of the record
+	 * @return true if the record is newer than lastUpdate or its date is unparseable
+	 */
+	private static boolean isModified(String orcidId, String modifiedDate) {
+		Date modifiedDateDt;
+		Date lastUpdateDt;
+		try {
+			if (modifiedDate.length() != 19) {
+				modifiedDate = modifiedDate.substring(0, 19);
+			}
+			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
+			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
+		} catch (Exception e) {
+			// Placeholders are required: without them SLF4J drops the extra argument.
+			logger.info("[{}] Parsing date: {}", orcidId, e.getMessage());
+			return true;
+		}
+		return modifiedDateDt.after(lastUpdateDt);
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenLastModifiedSeq.java
@ -0,0 +1,99 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.mortbay.log.Log;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class SparkGenLastModifiedSeq {
+	private static String hdfsServerUri;
+	private static String workingPath;
+	private static String outputPath;
+	private static String lambdaFileName;
+
+	public static void main(String[] args) throws IOException, Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkGenLastModifiedSeq.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
+		parser.parseArgument(args);
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		hdfsServerUri = parser.get("hdfsServerUri");
+		workingPath = parser.get("workingPath");
+		outputPath = parser.get("outputPath");
+		lambdaFileName = parser.get("lambdaFileName");
+		String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
+
+		SparkConf sparkConf = new SparkConf();
+		runWithSparkSession(
+			sparkConf,
+			isSparkSessionManaged,
+			spark -> {
+				int rowsNum = 0;
+				Path output = new Path(
+					hdfsServerUri
+						.concat(workingPath)
+						.concat(outputPath));
+				Path hdfsreadpath = new Path(lambdaFileUri);
+				Configuration conf = new Configuration();
+				conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
+				conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+				conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+				FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
+				FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
+				try (TarArchiveInputStream tais = new TarArchiveInputStream(
+					new GzipCompressorInputStream(lambdaFileStream))) {
+					TarArchiveEntry entry = null;
+					try (SequenceFile.Writer writer = SequenceFile
+						.createWriter(
+							conf,
+							SequenceFile.Writer.file(output),
+							SequenceFile.Writer.keyClass(Text.class),
+							SequenceFile.Writer.valueClass(Text.class),
+							SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
+						while ((entry = tais.getNextTarEntry()) != null) {
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais));
+							String line;
+							while ((line = br.readLine()) != null) {
+								String[] values = line.split(",");
+								List<String> recordInfo = Arrays.asList(values);
+								String orcidId = recordInfo.get(0);
+								final Text key = new Text(orcidId);
+								final Text value = new Text(recordInfo.get(3));
+								writer.append(key, value);
+								rowsNum++;
+							}
+						}
+					}
+				}
+				Log.info("Saved rows from lamda csv tar file: " + rowsNum);
+			});
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
@ -13,9 +13,6 @@ import java.util.stream.Stream;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
@ -33,7 +30,7 @@ import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import scala.Tuple2;

--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkOrcidGenerateAuthors.java
@ -1,165 +0,0 @@
-
-package eu.dnetlib.doiboost.orcid;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.io.IOException;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.List;
-import java.util.Optional;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.io.Text;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClients;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.util.LongAccumulator;
-import org.mortbay.log.Log;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
-import scala.Tuple2;
-
-public class SparkOrcidGenerateAuthors {
-
-	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
-	static final String lastUpdate = "2019-09-30 00:00:00";
-
-	public static void main(String[] args) throws IOException, Exception {
-		Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
-		logger.info("[ SparkOrcidGenerateAuthors STARTED]");
-
-		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkOrcidGenerateAuthors.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
-		parser.parseArgument(args);
-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-		final String workingPath = parser.get("workingPath");
-		logger.info("workingPath: ", workingPath);
-		final String outputAuthorsPath = parser.get("outputAuthorsPath");
-		logger.info("outputAuthorsPath: ", outputAuthorsPath);
-		final String token = parser.get("token");
-
-		SparkConf conf = new SparkConf();
-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-
-				LongAccumulator parsedRecordsAcc = sc.sc().longAccumulator("parsedRecords");
-				LongAccumulator modifiedRecordsAcc = sc.sc().longAccumulator("modifiedRecords");
-				LongAccumulator downloadedRecordsAcc = sc.sc().longAccumulator("downloadedRecords");
-				LongAccumulator alreadyDownloadedRecords = sc.sc().longAccumulator("alreadyDownloadedRecords");
-				JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "lamdafiles");
-
-				JavaRDD<String> downloadedRDD = sc.textFile(workingPath + "downloaded");
-				Function<String, String> getOrcidIdFunction = line -> {
-					try {
-						String[] values = line.split(",");
-						return values[0].substring(1);
-					} catch (Exception e) {
-						return new String("");
-					}
-				};
-				List<String> downloadedRecords = downloadedRDD.map(getOrcidIdFunction).collect();
-
-				Function<String, Boolean> isModifiedAfterFilter = line -> {
-					String[] values = line.split(",");
-					String orcidId = values[0];
-					parsedRecordsAcc.add(1);
-					if (isModified(orcidId, values[3])) {
-						modifiedRecordsAcc.add(1);
-						return true;
-					}
-					return false;
-				};
-				Function<String, Boolean> isNotDownloadedFilter = line -> {
-					String[] values = line.split(",");
-					String orcidId = values[0];
-					if (downloadedRecords.contains(orcidId)) {
-						alreadyDownloadedRecords.add(1);
-						return false;
-					}
-					return true;
-				};
-				Function<String, Tuple2<String, String>> downloadRecordFunction = line -> {
-					String[] values = line.split(",");
-					String orcidId = values[0];
-					String modifiedDate = values[3];
-					return downloadRecord(orcidId, modifiedDate, token, downloadedRecordsAcc);
-				};
-
-				lamdaFileRDD
-					.filter(isModifiedAfterFilter)
-					.filter(isNotDownloadedFilter)
-					.map(downloadRecordFunction)
-					.rdd()
-					.saveAsTextFile(workingPath.concat(outputAuthorsPath));
-			});
-
-	}
-
-	private static boolean isModified(String orcidId, String modifiedDate) {
-		Date modifiedDateDt = null;
-		Date lastUpdateDt = null;
-		try {
-			if (modifiedDate.length() != 19) {
-				modifiedDate = modifiedDate.substring(0, 19);
-			}
-			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
-			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
-		} catch (Exception e) {
-			Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
-			return true;
-		}
-		return modifiedDateDt.after(lastUpdateDt);
-	}
-
-	private static Tuple2<String, String> downloadRecord(String orcidId, String modifiedDate, String token,
-		LongAccumulator downloadedRecordsAcc) {
-		final DownloadedRecordData data = new DownloadedRecordData();
-		data.setOrcidId(orcidId);
-		data.setModifiedDate(modifiedDate);
-		try (CloseableHttpClient client = HttpClients.createDefault()) {
-			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
-			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
-			httpGet.addHeader("Authorization", String.format("Bearer %s", token));
-			CloseableHttpResponse response = client.execute(httpGet);
-			int statusCode = response.getStatusLine().getStatusCode();
-			data.setStatusCode(statusCode);
-			if (statusCode != 200) {
-				Log
-					.warn(
-						"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
-				return data.toTuple2();
-			}
-			downloadedRecordsAcc.add(1);
-			data
-				.setCompressedData(
-					ArgumentApplicationParser.compressArgument(IOUtils.toString(response.getEntity().getContent())));
-		} catch (Throwable e) {
-			Log.warn("Downloading " + orcidId, e.getMessage());
-			data.setErrorMessage(e.getMessage());
-			return data.toTuple2();
-		}
-		return data.toTuple2();
-	}
-}
--- a/Show More
+++ b/Show More