Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi

2021-02-04 10:44:21 +01:00 · 2021-02-04 10:44:21 +01:00 · c238561001
parent 465ce39f75 c67329d3ad
commit c238561001
182 changed files with 12366 additions and 732 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
@ -0,0 +1,30 @@
 package eu.dnetlib.dhp.common;
 import java.util.Map;
 import com.google.common.collect.Maps;
 /**
  * Constants used to translate access-right class ids into COAR access-right codes and labels.
  */
 public class Constants {
 	/** Maps an access-right class id (e.g. {@code OPEN}) to the corresponding COAR code (e.g. {@code c_abf2}). */
 	public static final Map<String, String> accessRightsCoarMap = new java.util.HashMap<>();
 	/** Maps a COAR code (e.g. {@code c_abf2}) to its label (e.g. {@code OPEN}). */
 	public static final Map<String, String> coarCodeLabelMap = new java.util.HashMap<>();
 	/** Scheme URL passed along with every COAR access right; declared final so it cannot be reassigned. */
 	public static final String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
 	static {
 		accessRightsCoarMap.put("OPEN", "c_abf2");
 		accessRightsCoarMap.put("RESTRICTED", "c_16ec");
 		// OPEN SOURCE shares the code used for OPEN
 		accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
 		accessRightsCoarMap.put("CLOSED", "c_14cb");
 		accessRightsCoarMap.put("EMBARGO", "c_f1cf");

 		coarCodeLabelMap.put("c_abf2", "OPEN");
 		coarCodeLabelMap.put("c_16ec", "RESTRICTED");
 		coarCodeLabelMap.put("c_14cb", "CLOSED");
 		coarCodeLabelMap.put("c_f1cf", "EMBARGO");
 	}
 }
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/GraphResultMapper.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/GraphResultMapper.java
@ -0,0 +1,412 @@
 package eu.dnetlib.dhp.common;
 import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.dump.oaf.*;
 import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
 import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.Journal;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 public class GraphResultMapper implements Serializable {
 	/**
 	 * Maps a graph result onto a new {@code CommunityResult} of the dump model.
 	 * <p>
 	 * When the input has a result type, the type-specific fields are copied first (container for publications; size,
 	 * version and geolocation for datasets; code repository url, documentation urls and programming language for
 	 * software; contact group, contact person and tool for other research products), followed by the fields shared by
 	 * all results. The collectedfrom list is copied whether or not a result type is present.
 	 *
 	 * @param in the entity to map; it is cast to the graph {@code Result} and, depending on the classid of its result
 	 *           type, to the matching subclass
 	 * @param <E> the type of the input entity
 	 * @return the populated dump result
 	 * @throws ClassCastException if the input is not a graph result, or not an instance of the subclass selected by
 	 *                            its result type
 	 */
 	public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
 		E in) {
 		CommunityResult out = new CommunityResult();
 		eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
 		Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
 		if (ort.isPresent()) {
 			switch (ort.get().getClassid()) {
 				case "publication":
 					Optional<Journal> journal = Optional
 						.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
 					if (journal.isPresent()) {
 						Journal j = journal.get();
 						Container c = new Container();
 						c.setConferencedate(j.getConferencedate());
 						c.setConferenceplace(j.getConferenceplace());
 						c.setEdition(j.getEdition());
 						c.setEp(j.getEp());
 						c.setIss(j.getIss());
 						c.setIssnLinking(j.getIssnLinking());
 						c.setIssnOnline(j.getIssnOnline());
 						c.setIssnPrinted(j.getIssnPrinted());
 						c.setName(j.getName());
 						c.setSp(j.getSp());
 						c.setVol(j.getVol());
 						out.setContainer(c);
 						out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
 					}
 					break;
 				case "dataset":
 					eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
 					Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
 					Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
 					out
 						.setGeolocation(
 							Optional
 								.ofNullable(id.getGeolocation())
 								.map(
 									igl -> igl
 										.stream()
 										.filter(Objects::nonNull)
 										.map(gli -> {
 											GeoLocation gl = new GeoLocation();
 											gl.setBox(gli.getBox());
 											gl.setPlace(gli.getPlace());
 											gl.setPoint(gli.getPoint());
 											return gl;
 										})
 										.collect(Collectors.toList()))
 								.orElse(null));
 					out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
 					break;
 				case "software":
 					eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
 					Optional
 						.ofNullable(is.getCodeRepositoryUrl())
 						.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
 					Optional
 						.ofNullable(is.getDocumentationUrl())
 						.ifPresent(
 							value -> out
 								.setDocumentationUrl(
 									value
 										.stream()
 										.map(v -> v.getValue())
 										.collect(Collectors.toList())));
 					Optional
 						.ofNullable(is.getProgrammingLanguage())
 						.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
 					out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
 					break;
 				case "other":
 					eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
 					out
 						.setContactgroup(
 							Optional
 								.ofNullable(ir.getContactgroup())
 								.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
 								.orElse(null));
 					out
 						.setContactperson(
 							Optional
 								.ofNullable(ir.getContactperson())
 								.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
 								.orElse(null));
 					out
 						.setTool(
 							Optional
 								.ofNullable(ir.getTool())
 								.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
 								.orElse(null));
 					out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
 					break;
 			}
 			Optional
 				.ofNullable(input.getAuthor())
 				.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
 			// I do not map Access Right UNKNOWN or OTHER
 			Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
 			if (oar.isPresent() && Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
 				String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
 				out
 					.setBestaccessright(
 						AccessRight
 							.newInstance(
 								code,
 								Constants.coarCodeLabelMap.get(code),
 								Constants.COAR_ACCESS_RIGHT_SCHEMA));
 			}
 			final List<String> contributorList = new ArrayList<>();
 			Optional
 				.ofNullable(input.getContributor())
 				.ifPresent(value -> value.forEach(c -> contributorList.add(c.getValue())));
 			out.setContributor(contributorList);
 			Optional
 				.ofNullable(input.getCountry())
 				.ifPresent(
 					value -> out
 						.setCountry(
 							value
 								.stream()
 								.map(
 									c -> {
 										// countries with the UNKNOWN classid are dropped by the filter below
 										if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
 											return null;
 										}
 										Country country = new Country();
 										country.setCode(c.getClassid());
 										country.setLabel(c.getClassname());
 										Optional
 											.ofNullable(c.getDataInfo())
 											.ifPresent(
 												provenance -> country
 													.setProvenance(
 														Provenance
 															.newInstance(
 																provenance
 																	.getProvenanceaction()
 																	.getClassname(),
 																provenance.getTrust())));
 										return country;
 									})
 								.filter(Objects::nonNull)
 								.collect(Collectors.toList())));
 			final List<String> coverageList = new ArrayList<>();
 			Optional
 				.ofNullable(input.getCoverage())
 				.ifPresent(value -> value.forEach(c -> coverageList.add(c.getValue())));
 			out.setCoverage(coverageList);
 			out.setDateofcollection(input.getDateofcollection());
 			final List<String> descriptionList = new ArrayList<>();
 			Optional
 				.ofNullable(input.getDescription())
 				.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
 			out.setDescription(descriptionList);
 			Optional
 				.ofNullable(input.getEmbargoenddate())
 				.ifPresent(value -> out.setEmbargoenddate(value.getValue()));
 			final List<String> formatList = new ArrayList<>();
 			Optional
 				.ofNullable(input.getFormat())
 				.ifPresent(value -> value.forEach(f -> formatList.add(f.getValue())));
 			out.setFormat(formatList);
 			out.setId(input.getId());
 			out.setOriginalId(input.getOriginalId());
 			Optional
 				.ofNullable(input.getInstance())
 				.ifPresent(
 					value -> out
 						.setInstance(
 							value.stream().map(i -> getInstance(i)).collect(Collectors.toList())));
 			Optional
 				.ofNullable(input.getLanguage())
 				.ifPresent(
 					language -> out
 						.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname())));
 			Optional
 				.ofNullable(input.getLastupdatetimestamp())
 				.ifPresent(value -> out.setLastupdatetimestamp(value));
 			Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
 			if (otitle.isPresent()) {
 				// only the first title of each kind is kept
 				otitle
 					.get()
 					.stream()
 					.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
 					.findFirst()
 					.ifPresent(t -> out.setMaintitle(t.getValue()));
 				otitle
 					.get()
 					.stream()
 					.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
 					.findFirst()
 					.ifPresent(t -> out.setSubtitle(t.getValue()));
 			}
 			List<ControlledField> pids = new ArrayList<>();
 			Optional
 				.ofNullable(input.getPid())
 				.ifPresent(
 					value -> value
 						.forEach(
 							p -> pids
 								.add(
 									ControlledField
 										.newInstance(p.getQualifier().getClassid(), p.getValue()))));
 			out.setPid(pids);
 			Optional
 				.ofNullable(input.getDateofacceptance())
 				.ifPresent(value -> out.setPublicationdate(value.getValue()));
 			Optional
 				.ofNullable(input.getPublisher())
 				.ifPresent(value -> out.setPublisher(value.getValue()));
 			List<String> sourceList = new ArrayList<>();
 			Optional
 				.ofNullable(input.getSource())
 				.ifPresent(value -> value.forEach(s -> sourceList.add(s.getValue())));
 			// the collected sources were previously discarded: set them like the other list fields
 			out.setSource(sourceList);
 			List<Subject> subjectList = new ArrayList<>();
 			Optional
 				.ofNullable(input.getSubject())
 				.ifPresent(
 					value -> value
 						.forEach(s -> subjectList.add(getSubject(s))));
 			out.setSubjects(subjectList);
 			// overrides the type set in the switch above with the classid of the result type
 			out.setType(input.getResulttype().getClassid());
 		}
 		// guard against a missing collectedfrom list, consistently with the other optional fields
 		Optional
 			.ofNullable(input.getCollectedfrom())
 			.ifPresent(
 				value -> out
 					.setCollectedfrom(
 						value
 							.stream()
 							.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
 							.collect(Collectors.toList())));
 		return out;
 	}
 	/**
 	 * Maps a graph instance onto a {@code CommunityInstance}.
 	 * <p>
 	 * The common values are copied by {@code setCommonValue}; collectedfrom and hostedby are copied only when they
 	 * are present on the input, so an instance lacking either of them no longer causes a NullPointerException.
 	 *
 	 * @param i the graph instance to map
 	 * @return the mapped instance
 	 */
 	private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
 		CommunityInstance instance = new CommunityInstance();
 		setCommonValue(i, instance);
 		Optional
 			.ofNullable(i.getCollectedfrom())
 			.ifPresent(cf -> instance.setCollectedfrom(KeyValue.newInstance(cf.getKey(), cf.getValue())));
 		Optional
 			.ofNullable(i.getHostedby())
 			.ifPresent(hb -> instance.setHostedby(KeyValue.newInstance(hb.getKey(), hb.getValue())));
 		return instance;
 	}
 	/**
 	 * Copies onto the dump instance the values every instance type shares: access right (only when its classid has a
 	 * COAR mapping), license, publication date, refereed, type and url. Values missing on the input are left unset.
 	 *
 	 * @param i the graph instance to read from
 	 * @param instance the dump instance to fill
 	 * @param <I> the concrete type of the dump instance
 	 */
 	private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
 		eu.dnetlib.dhp.schema.oaf.Qualifier accessRight = i.getAccessright();
 		if (accessRight != null && Constants.accessRightsCoarMap.containsKey(accessRight.getClassid())) {
 			String code = Constants.accessRightsCoarMap.get(accessRight.getClassid());
 			instance
 				.setAccessright(
 					AccessRight
 						.newInstance(code, Constants.coarCodeLabelMap.get(code), Constants.COAR_ACCESS_RIGHT_SCHEMA));
 		}
 		if (i.getLicense() != null) {
 			instance.setLicense(i.getLicense().getValue());
 		}
 		if (i.getDateofacceptance() != null) {
 			instance.setPublicationdate(i.getDateofacceptance().getValue());
 		}
 		if (i.getRefereed() != null) {
 			instance.setRefereed(i.getRefereed().getClassname());
 		}
 		if (i.getInstancetype() != null) {
 			instance.setType(i.getInstancetype().getClassname());
 		}
 		if (i.getUrl() != null) {
 			instance.setUrl(i.getUrl());
 		}
 	}
 	/**
 	 * Maps a subject of the graph onto a dump {@code Subject}; the provenance is filled only when the input carries
 	 * a data info.
 	 *
 	 * @param s the graph subject
 	 * @return the mapped subject
 	 */
 	private static Subject getSubject(StructuredProperty s) {
 		Subject mapped = new Subject();
 		mapped.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
 		DataInfo info = s.getDataInfo();
 		if (info == null) {
 			return mapped;
 		}
 		Provenance provenance = new Provenance();
 		provenance.setProvenance(info.getProvenanceaction().getClassname());
 		provenance.setTrust(info.getTrust());
 		mapped.setProvenance(provenance);
 		return mapped;
 	}
 	/**
 	 * Maps a graph author onto a dump {@code Author}, copying names and rank and, when one is found among the pids,
 	 * the ORCID returned by {@code getOrcid}.
 	 *
 	 * @param oa the graph author
 	 * @return the mapped author
 	 */
 	private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
 		Author mapped = new Author();
 		mapped.setFullname(oa.getFullname());
 		mapped.setName(oa.getName());
 		mapped.setSurname(oa.getSurname());
 		mapped.setRank(oa.getRank());
 		List<StructuredProperty> authorPids = oa.getPid();
 		if (authorPids != null) {
 			Pid orcid = getOrcid(authorPids);
 			if (orcid != null) {
 				mapped.setPid(orcid);
 			}
 		}
 		return mapped;
 	}
 	/**
 	 * Returns the first pid of the list whose qualifier classid equals {@code ModelConstants.ORCID}, together with its
 	 * provenance when the pid carries a data info.
 	 *
 	 * @param p the pids of an author
 	 * @return the mapped ORCID pid, or {@code null} when the list contains none
 	 */
 	private static Pid getOrcid(List<StructuredProperty> p) {
 		for (StructuredProperty candidate : p) {
 			if (!candidate.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
 				continue;
 			}
 			DataInfo info = candidate.getDataInfo();
 			ControlledField orcid = ControlledField
 				.newInstance(candidate.getQualifier().getClassid(), candidate.getValue());
 			if (info == null) {
 				return Pid.newInstance(orcid);
 			}
 			return Pid
 				.newInstance(
 					orcid,
 					Provenance.newInstance(info.getProvenanceaction().getClassname(), info.getTrust()));
 		}
 		return null;
 	}
 }
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
@ -90,9 +90,6 @@ public class MakeTarArchive implements Serializable {
 		String p_string = p.toString();
 		if (!p_string.endsWith("_SUCCESS")) {
 			String name = p_string.substring(p_string.lastIndexOf("/") + 1);
 			if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
 				name = "communities_infrastructures.json";
 			}
 			TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
 			entry.setSize(fileStatus.getLen());
 			current_size += fileStatus.getLen();
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.lang3.StringUtils;
@ -32,27 +33,33 @@ public class AuthorMerger {
 	}
-	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
 		int pa = countAuthorsPids(a);
 		int pb = countAuthorsPids(b);
 		List<Author> base, enrich;
 		int sa = authorsSize(a);
 		int sb = authorsSize(b);
-		if (pa == pb) {
+		if (sa == sb) {
 			base = sa > sb ? a : b;
 			enrich = sa > sb ? b : a;
 		} else {
 			base = pa > pb ? a : b;
 			enrich = pa > pb ? b : a;
 		} else {
 			base = sa > sb ? a : b;
 			enrich = sa > sb ? b : a;
 		}
-		enrichPidFromList(base, enrich);
+		enrichPidFromList(base, enrich, threshold);
 		return base;
 	}
-	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
 		return mergeAuthor(a, b, THRESHOLD);
 	}
 	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
 		if (base == null || enrich == null)
 			return;
 		// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
 		final Map<String, Author> basePidAuthorMap = base
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -63,6 +70,7 @@ public class AuthorMerger {
 					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
 			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
 		// <pid, Author> (list of pid that are missing in the other list)
 		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -83,10 +91,10 @@ public class AuthorMerger {
 						.max(Comparator.comparing(Tuple2::_1));
 					if (simAuthor.isPresent()) {
-						double th = THRESHOLD;
+						double th = threshold;
 						// increase the threshold if the surname is too short
 						if (simAuthor.get()._2().getSurname() != null
-							&& simAuthor.get()._2().getSurname().length() <= 3)
+							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
 							th = 0.99;
 						if (simAuthor.get()._1() > th) {
@ -156,7 +164,7 @@ public class AuthorMerger {
 	}
 	private static String normalize(final String s) {
-		return nfd(s)
+		String[] normalized = nfd(s)
 			.toLowerCase()
 			// do not compact the regexes in a single expression, would cause StackOverflowError
 			// in case
@ -166,7 +174,12 @@ public class AuthorMerger {
 			.replaceAll("(\\p{Punct})+", " ")
 			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
-			.trim();
+			.trim()
 			.split(" ");
 		Arrays.sort(normalized);
 		return String.join(" ", normalized);
 	}
 	private static String nfd(final String s) {
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
@ -0,0 +1,100 @@
 package eu.dnetlib.dhp.oa.merge;
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import scala.Tuple2;
 /**
  * Tests the merge of the author lists read from the sample publications.
  */
 public class AuthorMergerTest {
 	// shared mapper: avoids building a new instance for every line of the sample file
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private String publicationsBasePath;
 	private List<List<Author>> authors;
 	/**
 	 * Loads the author list of each publication in the sample file.
 	 */
 	@BeforeEach
 	public void setUp() throws Exception {
 		publicationsBasePath = Paths
 			.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
 			.toFile()
 			.getAbsolutePath();
 		authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
 			.stream()
 			.map(p -> p._2().getAuthor())
 			.collect(Collectors.toList());
 	}
 	@Test
 	public void mergeTest() { // used in the dedup: threshold set to 0.95
 		// index-based loop: the list number no longer depends on List.indexOf, which returns the first equal list
 		for (int i = 0; i < authors.size(); i++) {
 			System.out.println("List " + (i + 1));
 			for (Author author : authors.get(i)) {
 				System.out.println(authorToString(author));
 			}
 		}
 		List<Author> merge = AuthorMerger.merge(authors);
 		System.out.println("Merge ");
 		for (Author author : merge) {
 			System.out.println(authorToString(author));
 		}
 		Assertions.assertEquals(7, merge.size());
 	}
 	/**
 	 * Reads a file containing one json document per line.
 	 *
 	 * @param path the path of the file to read
 	 * @param clazz the class each line is deserialized to
 	 * @param <T> the type of the deserialized documents
 	 * @return for each line, the pair made of the value found at {@code $.id} and the deserialized document
 	 * @throws IllegalStateException if the file cannot be read or a line cannot be deserialized
 	 */
 	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
 		List<Tuple2<String, T>> res = new ArrayList<>();
 		// try-with-resources: the reader is closed even when reading or parsing fails
 		try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
 			String line;
 			while ((line = reader.readLine()) != null) {
 				res
 					.add(
 						new Tuple2<>(
 							MapDocumentUtil.getJPathString("$.id", line),
 							OBJECT_MAPPER.readValue(line, clazz)));
 			}
 		} catch (IOException e) {
 			// fail loudly instead of silently returning a partial sample
 			throw new IllegalStateException("unable to read sample file " + path, e);
 		}
 		return res;
 	}
 	/**
 	 * Builds a printable representation of an author: its fullname followed by the comparable string of each pid.
 	 *
 	 * @param a the author to print
 	 * @return the printable representation
 	 */
 	public String authorToString(Author a) {
 		StringBuilder print = new StringBuilder("Fullname = ");
 		print.append(a.getFullname()).append(" pid = [");
 		if (a.getPid() != null) {
 			for (StructuredProperty sp : a.getPid()) {
 				print.append(sp.toComparableString()).append(" ");
 			}
 		}
 		print.append("]");
 		return print.toString();
 	}
 }
--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
@ -105,6 +105,8 @@ public class ModelConstants {
 	public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
 		"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
 	public static final Qualifier UNKNOWN_COUNTRY = qualifier(UNKNOWN, "Unknown", DNET_COUNTRY_TYPE, DNET_COUNTRY_TYPE);
 	private static Qualifier qualifier(
 		final String classid,
 		final String classname,
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
@ -2,8 +2,12 @@
 package eu.dnetlib.dhp.schema.oaf;
 import java.io.Serializable;
 import java.util.Collection;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 public abstract class Oaf implements Serializable {
@ -40,9 +44,34 @@ public abstract class Oaf implements Serializable {
 		this.lastupdatetimestamp = lastupdatetimestamp;
 	}
-	public void mergeOAFDataInfo(Oaf e) {
+	public void mergeFrom(Oaf o) {
-		if (e.getDataInfo() != null && compareTrust(this, e) < 0)
+		if (Objects.isNull(o)) {
-			dataInfo = e.getDataInfo();
+			return;
 		}
 		setCollectedfrom(
 			Stream
 				.concat(
 					Optional
 						.ofNullable(getCollectedfrom())
 						.map(Collection::stream)
 						.orElse(Stream.empty()),
 					Optional
 						.ofNullable(o.getCollectedfrom())
 						.map(Collection::stream)
 						.orElse(Stream.empty()))
 				.distinct() // relies on KeyValue.equals
 				.collect(Collectors.toList()));
 		setLastupdatetimestamp(
 			Math
 				.max(
 					Optional.ofNullable(getLastupdatetimestamp()).orElse(0L),
 					Optional.ofNullable(o.getLastupdatetimestamp()).orElse(0L)));
 	}
 	public void mergeOAFDataInfo(Oaf o) {
 		if (o.getDataInfo() != null && compareTrust(this, o) < 0)
 			dataInfo = o.getDataInfo();
 	}
 	protected String extractTrust(Oaf e) {
@ -62,7 +91,7 @@ public abstract class Oaf implements Serializable {
 		if (o == null || getClass() != o.getClass())
 			return false;
 		Oaf oaf = (Oaf) o;
-		return Objects.equals(dataInfo, oaf.dataInfo)
+		return Objects.equals(getDataInfo(), oaf.getDataInfo())
 			&& Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp);
 	}
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java
@ -78,14 +78,10 @@ public abstract class OafEntity extends Oaf implements Serializable {
 	}
 	public void mergeFrom(OafEntity e) {
-
+		super.mergeFrom(e);
 		if (e == null)
 			return;
 		originalId = mergeLists(originalId, e.getOriginalId());
 		collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
 		pid = mergeLists(pid, e.getPid());
 		if (e.getDateofcollection() != null && compareTrust(this, e) < 0)
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java
@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable {
 			? p.getFundedamount()
 			: fundedamount;
 		// programme = mergeLists(programme, p.getProgramme());
 		h2020classification = mergeLists(h2020classification, p.getH2020classification());
 		mergeOAFDataInfo(e);
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
@ -130,19 +130,7 @@ public class Relation extends Oaf {
 			Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
 		checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
-		setCollectedfrom(
+		super.mergeFrom(r);
 			Stream
 				.concat(
 					Optional
 						.ofNullable(getCollectedfrom())
 						.map(Collection::stream)
 						.orElse(Stream.empty()),
 					Optional
 						.ofNullable(r.getCollectedfrom())
 						.map(Collection::stream)
 						.orElse(Stream.empty()))
 				.distinct() // relies on KeyValue.equals
 				.collect(Collectors.toList()));
 	}
 	@Override
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
@ -243,7 +243,7 @@ public class Result extends OafEntity implements Serializable {
 		Result r = (Result) e;
-		// TODO consider merging also Measures
+		measures = mergeLists(measures, r.getMeasures());
 		instance = mergeLists(instance, r.getInstance());
@ -323,13 +323,13 @@ public class Result extends OafEntity implements Serializable {
 		if (a.size() == b.size()) {
 			int msa = a
 				.stream()
-				.filter(i -> i.getValue() != null)
+				.filter(i -> i != null && i.getValue() != null)
 				.map(i -> i.getValue().length())
 				.max(Comparator.naturalOrder())
 				.orElse(0);
 			int msb = b
 				.stream()
-				.filter(i -> i.getValue() != null)
+				.filter(i -> i != null && i.getValue() != null)
 				.map(i -> i.getValue().length())
 				.max(Comparator.naturalOrder())
 				.orElse(0);
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipDeserialize.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipDeserialize.java
@ -0,0 +1,28 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 /**
  * Model of the bipFinder! input data: a map from a string key to a list of scores.
  * Only needed for deserialization purposes.
  */
 public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
 	public BipDeserialize() {
 	}
 	/**
 	 * Returns the scores stored under the key.
 	 *
 	 * @param key the key to look up
 	 * @return the stored list, or a new empty list when nothing (or null) is stored under the key
 	 */
 	public List<Score> get(String key) {
 		List<Score> scores = super.get(key);
 		if (scores != null) {
 			return scores;
 		}
 		return new ArrayList<>();
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipScore.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/BipScore.java
@ -0,0 +1,30 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.io.Serializable;
 import java.util.List;
 /**
  * Rewriting of the bipFinder input data, obtained by extracting the identifier of the result (doi).
  */
 public class BipScore implements Serializable {
 	/** Identifier of the result (doi). */
 	private String id;
 	/** Scores of the result; the unit is the one given in the input file. */
 	private List<Score> scoreList;
 	/** @return the identifier of the result */
 	public String getId() {
 		return this.id;
 	}
 	/** @return the scores of the result */
 	public List<Score> getScoreList() {
 		return this.scoreList;
 	}
 	/** @param id the identifier of the result */
 	public void setId(final String id) {
 		this.id = id;
 	}
 	/** @param scoreList the scores of the result */
 	public void setScoreList(final List<Score> scoreList) {
 		this.scoreList = scoreList;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java
@ -0,0 +1,85 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.oaf.Result;
 /**
  * Collects all the atomic actions produced for the different result types and saves them together in the
  * output path for the ActionSet.
  */
 public class CollectAndSave implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
 	/**
 	 * Parses the arguments described by {@code input_actionset_parameter.json}, removes the output directory and
 	 * writes there the union of the sequence files found under the input path.
 	 *
 	 * @param args the command line arguments; {@code isSparkSessionManaged} (defaults to true when absent),
 	 *             {@code inputPath} and {@code outputPath} are read
 	 * @throws Exception if the arguments cannot be parsed or the job fails
 	 */
 	public static <I extends Result> void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				CollectAndSave.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				// the output directory is removed before writing
 				removeOutputDir(spark, outputPath);
 				collectAndSave(spark, inputPath, outputPath);
 			});
 	}
 	/**
 	 * Unions the sequence files stored under the {@code publication}, {@code dataset},
 	 * {@code otherresearchproduct} and {@code software} sub-directories of the input path and saves the result as a
 	 * sequence file in the output path.
 	 */
 	private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
 		JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		sc
 			.sequenceFile(inputPath + "/publication", Text.class, Text.class)
 			.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
 			.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
 			.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
 			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
 	}
 	/** Removes the given path using the hadoop configuration of the spark context. */
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/KeyValue.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/KeyValue.java
@ -0,0 +1,26 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.io.Serializable;
 /**
  * Serializable pair made of a string key and a string value.
  */
 public class KeyValue implements Serializable {
 	private String key;
 	private String value;
 	/** @return the key, or {@code null} when it has not been set */
 	public String getKey() {
 		return this.key;
 	}
 	/** @return the value, or {@code null} when it has not been set */
 	public String getValue() {
 		return this.value;
 	}
 	/** @param key the key to store */
 	public void setKey(final String key) {
 		this.key = key;
 	}
 	/** @param value the value to store */
 	public void setValue(final String value) {
 		this.value = value;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
@ -0,0 +1,28 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.io.Serializable;
 /**
  * Subset of the information of a generic result that is needed to create the atomic action:
  * the OpenAIRE identifier of the result and the value of one of its DOIs.
  */
 public class PreparedResult implements Serializable {

 	/** OpenAIRE identifier of the result. */
 	private String id;

 	/** DOI of the result. */
 	private String value;

 	/** @return the OpenAIRE identifier, or null when it has not been set */
 	public String getId() {
 		return this.id;
 	}

 	/** @param id the OpenAIRE identifier to store */
 	public void setId(final String id) {
 		this.id = id;
 	}

 	/** @return the DOI, or null when it has not been set */
 	public String getValue() {
 		return this.value;
 	}

 	/** @param value the DOI to store */
 	public void setValue(final String value) {
 		this.value = value;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Score.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Score.java
@ -0,0 +1,30 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.io.Serializable;
 import java.util.List;
 /**
  * Represents a score as found in the input file: an identifier plus the list of
  * key/value pairs describing it.
  */
 public class Score implements Serializable {

 	/** Identifier of the score. */
 	private String id;

 	/** Key/value pairs attached to the score. */
 	private List<KeyValue> unit;

 	/** @return the identifier of the score, or null when it has not been set */
 	public String getId() {
 		return this.id;
 	}

 	/** @param id the identifier of the score */
 	public void setId(final String id) {
 		this.id = id;
 	}

 	/** @return the key/value pairs of the score, or null when they have not been set */
 	public List<KeyValue> getUnit() {
 		return this.unit;
 	}

 	/** @param unit the key/value pairs of the score */
 	public void setUnit(final List<KeyValue> unit) {
 		this.unit = unit;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -0,0 +1,200 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import scala.Tuple2;
 /**
  * Creates the atomic actions carrying the bip finder scores for one type of result.
  * <p>
  * The scores read from bipScorePath are joined, on the DOI, with the results read from inputPath;
  * for every matching result a new Result holding only the id, the data info and the measures is
  * produced, wrapped in an AtomicAction and written as a sequence file under outputPath.
  */
 public class SparkAtomicActionScoreJob implements Serializable {
 	/** classid of the pid qualifier identifying a DOI. */
 	private static final String DOI = "doi";
 	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

 	/**
 	 * Entry point of the job.
 	 *
 	 * @param args the command line arguments described in input_parameters.json (isSparkSessionManaged,
 	 *             inputPath, outputPath, bipScorePath, resultTableName)
 	 * @throws Exception when the arguments cannot be parsed, the result class cannot be loaded or the
 	 *                   Spark job fails
 	 */
 	public static <I extends Result> void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				SparkAtomicActionScoreJob.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		final String bipScorePath = parser.get("bipScorePath");
 		log.info("bipScorePath: {}", bipScorePath);
 		final String resultClassName = parser.get("resultTableName");
 		log.info("resultTableName: {}", resultClassName);
 		// the class name comes from the job arguments, so the cast cannot be checked at compile time
 		@SuppressWarnings("unchecked")
 		Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				removeOutputDir(spark, outputPath);
 				prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
 			});
 	}

 	/**
 	 * Joins the scores with the results on the DOI and stores one atomic action per matching result.
 	 *
 	 * @param spark        the Spark session
 	 * @param inputPath    the path of the results, one JSON record per line
 	 * @param outputPath   the path where the sequence file with the atomic actions is written
 	 * @param bipScorePath the path of the scores, one JSON record per line
 	 * @param inputClazz   the class of the results stored under inputPath
 	 */
 	private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
 		String bipScorePath, Class<I> inputClazz) {
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
 			.textFile(bipScorePath)
 			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
 		// every key of a deserialized record becomes a BipScore with the score list associated to that key
 		Dataset<BipScore> bipScores = spark
 			.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
 				BipScore bs = new BipScore();
 				bs.setId(key);
 				bs.setScoreList(entry.get(key));
 				return bs;
 			}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
 		Dataset<I> results = readPath(spark, inputPath, inputClazz);
 		results.createOrReplaceTempView("result");
 		// one row for each DOI of each result not deleted by inference
 		Dataset<PreparedResult> preparedResult = spark
 			.sql(
 				"select pIde.value value, id " +
 					"from result " +
 					"lateral view explode (pid) p as pIde " +
 					"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
 			.as(Encoders.bean(PreparedResult.class));
 		bipScores
 			.joinWith(
 				preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
 				"inner")
 			.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
 				// replaces the DOI with the identifier of the matching result
 				BipScore ret = value._1();
 				ret.setId(value._2().getId());
 				return ret;
 			}, Encoders.bean(BipScore.class))
 			.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
 			.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
 				// a result matched through several DOIs collects the measures of all of them
 				Result ret = new Result();
 				ret.setDataInfo(getDataInfo());
 				BipScore first = it.next();
 				ret.setId(first.getId());
 				ret.setMeasures(getMeasure(first));
 				it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
 				return ret;
 			}, Encoders.bean(Result.class))
 			.toJavaRDD()
 			.map(p -> new AtomicAction(inputClazz, p))
 			.mapToPair(
 				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
 			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
 	}

 	/**
 	 * Converts the scores of the given BipScore into measures, one measure per score.
 	 *
 	 * @param value the scores associated to one result
 	 * @return the list of measures, each one with the id of the score and its key/value units
 	 */
 	private static List<Measure> getMeasure(BipScore value) {
 		return value
 			.getScoreList()
 			.stream()
 			.map(score -> {
 				Measure m = new Measure();
 				m.setId(score.getId());
 				m
 					.setUnit(
 						score
 							.getUnit()
 							.stream()
 							.map(unit -> {
 								KeyValue kv = new KeyValue();
 								kv.setValue(unit.getValue());
 								kv.setKey(unit.getKey());
 								kv.setDataInfo(getDataInfo());
 								return kv;
 							})
 							.collect(Collectors.toList()));
 				return m;
 			})
 			.collect(Collectors.toList());
 	}

 	/**
 	 * Builds the data info attached to the produced results and to the units of their measures.
 	 *
 	 * @return a new DataInfo with all flags set to false, an empty trust and the
 	 *         sysimport:actionset provenance action
 	 */
 	private static DataInfo getDataInfo() {
 		DataInfo di = new DataInfo();
 		di.setInferred(false);
 		di.setInvisible(false);
 		di.setDeletedbyinference(false);
 		di.setTrust("");
 		Qualifier qualifier = new Qualifier();
 		qualifier.setClassid("sysimport:actionset");
 		qualifier.setClassname("Harvested");
 		qualifier.setSchemename("dnet:provenanceActions");
 		qualifier.setSchemeid("dnet:provenanceActions");
 		di.setProvenanceaction(qualifier);
 		return di;
 	}

 	/**
 	 * Removes the given path through HdfsSupport, using the Hadoop configuration of the Spark context.
 	 */
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}

 	/**
 	 * Reads a text file with one JSON record per line and maps every line to an instance of clazz.
 	 *
 	 * @param spark     the Spark session
 	 * @param inputPath the path of the text file
 	 * @param clazz     the bean class each line is deserialized to
 	 * @return the dataset of the deserialized records
 	 */
 	public static <R> Dataset<R> readPath(
 		SparkSession spark, String inputPath, Class<R> clazz) {
 		return spark
 			.read()
 			.textFile(inputPath)
 			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
@ -0,0 +1,20 @@
 [
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the path where the atomic actions produced for each result type are stored",
    "paramRequired": true
  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",
    "paramDescription": "the path of the new ActionSet",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json
@ -0,0 +1,32 @@
 [
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the input path of the results to be extended with the scores",
    "paramRequired": true
  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",
    "paramDescription": "the path of the new ActionSet",
    "paramRequired": true
  },
  {
    "paramName": "rtn",
    "paramLongName": "resultTableName",
    "paramDescription": "the fully qualified class name of the result type to process, e.g. eu.dnetlib.dhp.schema.oaf.Publication",
    "paramRequired": true
  },
  {
    "paramName": "bsp",
    "paramLongName": "bipScorePath",
    "paramDescription": "the path where to find the bipFinder scores",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
@ -0,0 +1,171 @@
 <workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
    <!--
        Flow: deleteoutputpath, then the atomicactions fork (publication, dataset, orp and software
        in parallel), then join_aa, then collectandsave, then End. Any failing action goes to Kill.
        NOTE(review): ${workingDir}, ${projectVersion}, ${nameNode} and the spark* properties are used
        below but not declared among the parameters; presumably they come from the job configuration,
        to be confirmed.
    -->
    <parameters>
        <property>
            <name>inputPath</name>
            <description>the input path of the resources to be extended</description>
        </property>
        <property>
            <name>bipScorePath</name>
            <description>the path where to find the bipFinder scores</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
    </parameters>
    <start to="deleteoutputpath"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Deletes and recreates both the output path and the working directory -->
    <action name="deleteoutputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
            <delete path="${workingDir}"/>
            <mkdir path="${workingDir}"/>
        </fs>
        <ok to="atomicactions"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkAtomicActionScoreJob once per result type, in parallel -->
    <fork name="atomicactions">
        <path start="atomicactions_publication"/>
        <path start="atomicactions_dataset"/>
        <path start="atomicactions_orp"/>
        <path start="atomicactions_software"/>
    </fork>
    <!-- Publications: reads ${inputPath}/publication, writes ${workingDir}/publication -->
    <action name="atomicactions_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the atomic action with the bip finder scores for publications</name>
            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
        </spark>
        <ok to="join_aa"/>
        <error to="Kill"/>
    </action>
    <!-- Datasets: reads ${inputPath}/dataset, writes ${workingDir}/dataset -->
    <action name="atomicactions_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the atomic action with the bip finder scores for datasets</name>
            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
        </spark>
        <ok to="join_aa"/>
        <error to="Kill"/>
    </action>
    <!-- Other research products: reads ${inputPath}/otherresearchproduct, writes ${workingDir}/otherresearchproduct -->
    <action name="atomicactions_orp">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the atomic action with the bip finder scores for orp</name>
            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
        </spark>
        <ok to="join_aa"/>
        <error to="Kill"/>
    </action>
    <!-- Software: reads ${inputPath}/software, writes ${workingDir}/software -->
    <action name="atomicactions_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the atomic action with the bip finder scores for software</name>
            <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputPath</arg><arg>${workingDir}/software</arg>
            <arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
        </spark>
        <ok to="join_aa"/>
        <error to="Kill"/>
    </action>
    <!-- Waits for the four per-type jobs before collecting their outputs -->
    <join name="join_aa" to="collectandsave"/>
    <!-- Collects the per-type outputs found under ${workingDir} and saves them under ${outputPath} -->
    <action name="collectandsave">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>saves all the aa produced for the several types of results in the as output path</name>
            <class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${workingDir}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
@ -0,0 +1,323 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 public class SparkAtomicActionScoreJobTest {
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static SparkSession spark;
 	private static Path workingDir;
 	private static final Logger log = LoggerFactory
 		.getLogger(SparkAtomicActionScoreJobTest.class);
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files
 			.createTempDirectory(SparkAtomicActionScoreJobTest.class.getSimpleName());
 		log.info("using work dir {}", workingDir);
 		SparkConf conf = new SparkConf();
 		conf.setAppName(SparkAtomicActionScoreJobTest.class.getSimpleName());
 		conf.setMaster("local[*]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.toString());
 		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
 		spark = SparkSession
 			.builder()
 			.appName(SparkAtomicActionScoreJobTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}
 	@AfterAll
 	public static void afterAll() throws IOException {
 		FileUtils.deleteDirectory(workingDir.toFile());
 		spark.stop();
 	}
 	@Test
 	public void matchOne() throws Exception {
 		String bipScoresPath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
 			.getPath();
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
 			.getPath();
 		SparkAtomicActionScoreJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-bipScorePath",
 					bipScoresPath,
 					"-resultTableName",
 					"eu.dnetlib.dhp.schema.oaf.Publication",
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Publication> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Publication) aa.getPayload()));
 		Assertions.assertTrue(tmp.count() == 1);
 		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
 		verificationDataset.createOrReplaceTempView("publication");
 		Dataset<Row> execVerification = spark
 			.sql(
 				"Select p.id oaid, mes.id, mUnit.value from publication p " +
 					"lateral view explode(measures) m as mes " +
 					"lateral view explode(mes.unit) u as mUnit ");
 		Assertions.assertEquals(2, execVerification.count());
 		Assertions
 			.assertEquals(
 				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
 				execVerification.select("oaid").collectAsList().get(0).getString(0));
 		Assertions
 			.assertEquals(
 				"1.47565045883e-08",
 				execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
 		Assertions
 			.assertEquals(
 				"0.227515392",
 				execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
 	}
 	@Test
 	public void matchOneWithTwo() throws Exception {
 		String bipScoresPath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
 			.getPath();
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
 			.getPath();
 		SparkAtomicActionScoreJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-bipScorePath",
 					bipScoresPath,
 					"-resultTableName",
 					"eu.dnetlib.dhp.schema.oaf.Publication",
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Publication> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Publication) aa.getPayload()));
 		Assertions.assertTrue(tmp.count() == 1);
 		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
 		verificationDataset.createOrReplaceTempView("publication");
 		Dataset<Row> execVerification = spark
 			.sql(
 				"Select p.id oaid, mes.id, mUnit.value from publication p " +
 					"lateral view explode(measures) m as mes " +
 					"lateral view explode(mes.unit) u as mUnit ");
 		Assertions.assertEquals(4, execVerification.count());
 		Assertions
 			.assertEquals(
 				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
 				execVerification.select("oaid").collectAsList().get(0).getString(0));
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'influence'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'popularity'").count());
 		List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
 		String tmp_influence = tmp_ds.get(0).getString(0);
 		Assertions
 			.assertTrue(
 				"1.47565045883e-08".equals(tmp_influence) ||
 					"1.98956540239e-08".equals(tmp_influence));
 		tmp_influence = tmp_ds.get(1).getString(0);
 		Assertions
 			.assertTrue(
 				"1.47565045883e-08".equals(tmp_influence) ||
 					"1.98956540239e-08".equals(tmp_influence));
 		Assertions.assertTrue(!tmp_ds.get(0).getString(0).equals(tmp_ds.get(1).getString(0)));
 	}
 	@Test
 	public void matchTwo() throws Exception {
 		String bipScoresPath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
 			.getPath();
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
 			.getPath();
 		SparkAtomicActionScoreJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-bipScorePath",
 					bipScoresPath,
 					"-resultTableName",
 					"eu.dnetlib.dhp.schema.oaf.Publication",
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Publication> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Publication) aa.getPayload()));
 		Assertions.assertTrue(tmp.count() == 2);
 		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
 		verificationDataset.createOrReplaceTempView("publication");
 		Dataset<Row> execVerification = spark
 			.sql(
 				"Select p.id oaid, mes.id, mUnit.value from publication p " +
 					"lateral view explode(measures) m as mes " +
 					"lateral view explode(mes.unit) u as mUnit ");
 		Assertions.assertEquals(4, execVerification.count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'influence'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'popularity'").count());
 		Assertions
 			.assertEquals(
 				"1.47565045883e-08",
 				execVerification
 					.filter(
 						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
 							"and id = 'influence'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
 		Assertions
 			.assertEquals(
 				"1.98956540239e-08",
 				execVerification
 					.filter(
 						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
 							"and id = 'influence'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
 		Assertions
 			.assertEquals(
 				"0.282046161584",
 				execVerification
 					.filter(
 						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
 							"and id = 'popularity'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
 		Assertions
 			.assertEquals(
 				"0.227515392",
 				execVerification
 					.filter(
 						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
 							"and id = 'popularity'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java
@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob {
 			IOUtils
 				.toString(
 					CheckDuplictedIdsJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json")));
 		parser.parseArgument(args);
 		final SparkConf conf = new SparkConf();
-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);
-		final String countPath = parser.get("workingPath") + "/counts";
+		final String countPath = parser.get("outputDir") + "/counts";
 		log.info("countPath: {}", countPath);
 		final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob {
 			.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(countPath);
 		;
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java
@ -44,10 +44,10 @@ public class GenerateEventsJob {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String eventsPath = workingPath + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
@ -59,6 +59,9 @@ public class GenerateEventsJob {
 		final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
 		log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
 		final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
 		log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@ -70,12 +73,12 @@ public class GenerateEventsJob {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
 			final Dataset<ResultGroup> groups = ClusterUtils
-				.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
+				.readPath(spark, workingDir + "/duplicates", ResultGroup.class);
 			final Dataset<Event> dataset = groups
 				.map(
 					g -> EventFinder
-						.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
+						.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
 					Encoders
 						.bean(EventGroup.class))
 				.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java
@ -46,7 +46,7 @@ public class GenerateStatsJob {
 		final SparkConf conf = new SparkConf();
-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final String dbUrl = parser.get("dbUrl");
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java
@ -46,7 +46,7 @@ public class IndexEventSubsetJob {
 		final SparkConf conf = new SparkConf();
-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final String index = parser.get("index");
@ -55,6 +55,18 @@ public class IndexEventSubsetJob {
 		final String indexHost = parser.get("esHost");
 		log.info("indexHost: {}", indexHost);
 		final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
 		log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
 		final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
 		log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
 		final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
 		log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
 		final String esNodesWanOnly = parser.get("esNodesWanOnly");
 		log.info("esNodesWanOnly: {}", esNodesWanOnly);
 		final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
 		log.info("maxEventsForTopic: {}", maxEventsForTopic);
@ -86,10 +98,10 @@ public class IndexEventSubsetJob {
 		esCfg.put("es.index.auto.create", "false");
 		esCfg.put("es.nodes", indexHost);
 		esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
-		esCfg.put("es.batch.write.retry.count", "8");
+		esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
-		esCfg.put("es.batch.write.retry.wait", "60s");
+		esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
-		esCfg.put("es.batch.size.entries", "200");
+		esCfg.put("es.batch.size.entries", esBatchSizeEntries);
-		esCfg.put("es.nodes.wan.only", "true");
+		esCfg.put("es.nodes.wan.only", esNodesWanOnly);
 		log.info("*** Start indexing");
 		JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java
@ -54,7 +54,7 @@ public class IndexNotificationsJob {
 		final SparkConf conf = new SparkConf();
-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final String index = parser.get("index");
@ -63,6 +63,18 @@ public class IndexNotificationsJob {
 		final String indexHost = parser.get("esHost");
 		log.info("indexHost: {}", indexHost);
 		final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
 		log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
 		final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
 		log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
 		final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
 		log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
 		final String esNodesWanOnly = parser.get("esNodesWanOnly");
 		log.info("esNodesWanOnly: {}", esNodesWanOnly);
 		final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
 		log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
@ -92,10 +104,10 @@ public class IndexNotificationsJob {
 			esCfg.put("es.index.auto.create", "false");
 			esCfg.put("es.nodes", indexHost);
 			esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
-			esCfg.put("es.batch.write.retry.count", "8");
+			esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
-			esCfg.put("es.batch.write.retry.wait", "60s");
+			esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
-			esCfg.put("es.batch.size.entries", "200");
+			esCfg.put("es.batch.size.entries", esBatchSizeEntries);
-			esCfg.put("es.nodes.wan.only", "true");
+			esCfg.put("es.nodes.wan.only", esNodesWanOnly);
 			log.info("*** Start indexing");
 			JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
@ -36,7 +36,7 @@ public class IndexOnESJob {
 		final SparkConf conf = new SparkConf();
-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final String index = parser.get("index");
@ -45,6 +45,18 @@ public class IndexOnESJob {
 		final String indexHost = parser.get("esHost");
 		log.info("indexHost: {}", indexHost);
 		final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
 		log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
 		final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
 		log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
 		final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
 		log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
 		final String esNodesWanOnly = parser.get("esNodesWanOnly");
 		log.info("esNodesWanOnly: {}", esNodesWanOnly);
 		final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
 		final JavaRDD<String> inputRdd = ClusterUtils
@ -53,15 +65,13 @@ public class IndexOnESJob {
 			.javaRDD();
 		final Map<String, String> esCfg = new HashMap<>();
 		// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
 		esCfg.put("es.index.auto.create", "false");
 		esCfg.put("es.nodes", indexHost);
 		esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
-		esCfg.put("es.batch.write.retry.count", "8");
+		esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
-		esCfg.put("es.batch.write.retry.wait", "60s");
+		esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
-		esCfg.put("es.batch.size.entries", "200");
+		esCfg.put("es.batch.size.entries", esBatchSizeEntries);
-		esCfg.put("es.nodes.wan.only", "true");
+		esCfg.put("es.nodes.wan.only", esNodesWanOnly);
 		JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java
@ -42,10 +42,10 @@ public class JoinStep0Job {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step0";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
 		final SparkConf conf = new SparkConf();
@ -57,10 +57,10 @@ public class JoinStep0Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class);
 			final Dataset<RelatedDatasource> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
+				.readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class);
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java
@ -40,10 +40,10 @@ public class JoinStep1Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step1";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step1";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
 		final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep1Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class);
 			final Dataset<RelatedProject> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
+				.readPath(spark, workingDir + "/relatedProjects", RelatedProject.class);
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java
@ -39,10 +39,10 @@ public class JoinStep2Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step2";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step2";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
 		final SparkConf conf = new SparkConf();
@ -54,10 +54,10 @@ public class JoinStep2Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class);
 			final Dataset<RelatedSoftware> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class);
+				.readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class);
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java
@ -40,10 +40,10 @@ public class JoinStep3Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step3";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step3";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
 		final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep3Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class);
 			final Dataset<RelatedDataset> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class);
+				.readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class);
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java
@ -40,10 +40,10 @@ public class JoinStep4Job {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String joinedEntitiesPath = workingPath + "/joinedEntities_step4";
+		final String joinedEntitiesPath = workingDir + "/joinedEntities_step4";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
 		final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep4Job {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class);
 			final Dataset<RelatedPublication> typedRels = ClusterUtils
-				.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class);
+				.readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class);
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
 				.toColumn();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java
@ -4,8 +4,13 @@ package eu.dnetlib.dhp.broker.oa;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
@ -13,6 +18,8 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.slf4j.Logger;
@ -37,7 +44,7 @@ public class PartitionEventsByDsIdJob {
 			IOUtils
 				.toString(
 					PartitionEventsByDsIdJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
 		parser.parseArgument(args);
 		final Boolean isSparkSessionManaged = Optional
@ -48,24 +55,43 @@ public class PartitionEventsByDsIdJob {
 		final SparkConf conf = new SparkConf();
-		final String eventsPath = parser.get("workingPath") + "/events";
+		final String eventsPath = parser.get("outputDir") + "/events";
 		log.info("eventsPath: {}", eventsPath);
-		final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
+		final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId";
 		log.info("partitionPath: {}", partitionPath);
 		final String opendoarIds = parser.get("opendoarIds");
 		log.info("opendoarIds: {}", opendoarIds);
 		final Set<String> validOpendoarIds = new HashSet<>();
 		if (!opendoarIds.trim().equals("-")) {
 			validOpendoarIds
 				.addAll(
 					Arrays
 						.stream(opendoarIds.split(","))
 						.map(String::trim)
 						.filter(StringUtils::isNotBlank)
 						.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
 						.collect(Collectors.toSet()));
 		}
 		log.info("validOpendoarIds: {}", validOpendoarIds);
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			ClusterUtils
 				.readPath(spark, eventsPath, Event.class)
-				.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
+				.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
-				.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
+				.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
-				.limit(10000)
+				.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
-				.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
+				.map(
 					(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
 					Encoders.bean(ShortEventMessageWithGroupId.class))
 				.coalesce(1)
 				.write()
 				.partitionBy("group")
 				.mode(SaveMode.Overwrite)
 				.option("compression", "gzip")
 				.json(partitionPath);
 		});
@ -97,6 +123,7 @@ public class PartitionEventsByDsIdJob {
 		final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId();
 		res.setEventId(e.getEventId());
 		res.setOriginalId(payload.getResult().getOriginalId());
 		res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null));
 		res.setTopic(e.getTopic());
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java
@ -45,10 +45,10 @@ public class PrepareGroupsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String groupsPath = workingPath + "/duplicates";
+		final String groupsPath = workingDir + "/duplicates";
 		log.info("groupsPath: {}", groupsPath);
 		final SparkConf conf = new SparkConf();
@ -60,10 +60,10 @@ public class PrepareGroupsJob {
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
 			final Dataset<OaBrokerMainEntity> results = ClusterUtils
-				.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
+				.readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class);
 			final Dataset<Relation> mergedRels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
 			final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java
@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String relsPath = workingPath + "/relatedDatasets";
+		final String relsPath = workingDir + "/relatedDatasets";
 		log.info("relsPath: {}", relsPath);
 		final SparkConf conf = new SparkConf();
@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob {
 				.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
 				.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob {
 			final Dataset<RelatedDataset> dataset = rels
 				.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
 				.map(t -> {
-					final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
+					final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
 						t._2);
 					rel.getRelDataset().setRelType(t._1.getRelClass());
 					return rel;
 				}, Encoders.bean(RelatedDataset.class));
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java
@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String relsPath = workingPath + "/relatedDatasources";
+		final String relsPath = workingDir + "/relatedDatasources";
 		log.info("relsPath: {}", relsPath);
 		final SparkConf conf = new SparkConf();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java
@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String relsPath = workingPath + "/relatedProjects";
+		final String relsPath = workingDir + "/relatedProjects";
 		log.info("relsPath: {}", relsPath);
 		final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob {
 				.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
 				.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java
@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String relsPath = workingPath + "/relatedPublications";
+		final String relsPath = workingDir + "/relatedPublications";
 		log.info("relsPath: {}", relsPath);
 		final SparkConf conf = new SparkConf();
@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob {
 					Encoders.bean(OaBrokerRelatedPublication.class));
 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
 				.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob {
 			final Dataset<RelatedPublication> dataset = rels
 				.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
 				.map(t -> {
-					final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
+					final RelatedPublication rel = new RelatedPublication(
 						t._1.getSource(), t._2);
 					rel.getRelPublication().setRelType(t._1.getRelClass());
 					return rel;
 				}, Encoders.bean(RelatedPublication.class));
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java
@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String relsPath = workingPath + "/relatedSoftwares";
+		final String relsPath = workingDir + "/relatedSoftwares";
 		log.info("relsPath: {}", relsPath);
 		final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob {
 				.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
 			final Dataset<Relation> rels = ClusterUtils
-				.readPath(spark, graphPath + "/relation", Relation.class)
+				.loadRelations(graphPath, spark)
 				.filter(r -> r.getDataInfo().getDeletedbyinference())
 				.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
 				.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob {
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
-		final String workingPath = parser.get("workingPath");
+		final String workingDir = parser.get("workingDir");
-		log.info("workingPath: {}", workingPath);
+		log.info("workingDir: {}", workingDir);
-		final String simpleEntitiesPath = workingPath + "/simpleEntities";
+		final String simpleEntitiesPath = workingDir + "/simpleEntities";
 		log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
 		final SparkConf conf = new SparkConf();
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java
@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
 	public EnrichMissingSubject() {
 		super(20,
-			s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
+			s -> {
 				switch (s.getType().toLowerCase()) {
 					case "acm":
 						return Topic.ENRICH_MISSING_SUBJECT_ACM;
 					case "arxiv":
 						return Topic.ENRICH_MISSING_SUBJECT_ARXIV;
 					case "ddc":
 						return Topic.ENRICH_MISSING_SUBJECT_DDC;
 					case "jel":
 						return Topic.ENRICH_MISSING_SUBJECT_JEL;
 					case "mesh":
 						return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC;
 					case "rvk":
 						return Topic.ENRICH_MISSING_SUBJECT_RVK;
 					default:
 						return null;
 				}
 			},
 			(p, s) -> p.getSubjects().add(s),
 			s -> subjectAsString(s));
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java
@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
 	public EnrichMoreSubject() {
 		super(20,
-			s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
+			s -> {
 				switch (s.getType().toLowerCase()) {
 					case "acm":
 						return Topic.ENRICH_MORE_SUBJECT_ACM;
 					case "arxiv":
 						return Topic.ENRICH_MORE_SUBJECT_ARXIV;
 					case "ddc":
 						return Topic.ENRICH_MORE_SUBJECT_DDC;
 					case "jel":
 						return Topic.ENRICH_MORE_SUBJECT_JEL;
 					case "mesh":
 						return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC;
 					case "rvk":
 						return Topic.ENRICH_MORE_SUBJECT_RVK;
 					default:
 						return null;
 				}
 			},
 			(p, s) -> p.getSubjects().add(s),
 			s -> subjectAsString(s));
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java
@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 public class ClusterUtils {
@ -30,6 +31,16 @@ public class ClusterUtils {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
 	/**
 	 * Reads the relations stored under {@code graphPath + "/relation"} and passes the source and
 	 * the target of each of them through {@link ConversionUtils#cleanOpenaireId(String)}.
 	 *
 	 * @param graphPath base path of the graph; the relations are read from its "/relation" sub-path
 	 * @param spark the session used to read the data
 	 * @return the relations, with cleaned source and target identifiers
 	 */
 	public static Dataset<Relation> loadRelations(final String graphPath, final SparkSession spark) {
 		final Dataset<Relation> rawRelations = readPath(spark, graphPath + "/relation", Relation.class);

 		return rawRelations.map(rel -> {
 			// The relation read from the input is updated in place and handed on.
 			final String cleanedSource = ConversionUtils.cleanOpenaireId(rel.getSource());
 			rel.setSource(cleanedSource);
 			final String cleanedTarget = ConversionUtils.cleanOpenaireId(rel.getTarget());
 			rel.setTarget(cleanedTarget);
 			return rel;
 		}, Encoders.bean(Relation.class));
 	}
 	public static <R> Dataset<R> readPath(
 		final SparkSession spark,
 		final String inputPath,
@ -67,6 +78,7 @@ public class ClusterUtils {
 			.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(path);
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
@ -74,7 +74,7 @@ public class ConversionUtils {
 		}
 		final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
-		res.setOpenaireId(d.getId());
+		res.setOpenaireId(cleanOpenaireId(d.getId()));
 		res.setOriginalId(first(d.getOriginalId()));
 		res.setTitle(structPropValue(d.getTitle()));
 		res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -89,7 +89,7 @@ public class ConversionUtils {
 		}
 		final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
-		res.setOpenaireId(p.getId());
+		res.setOpenaireId(cleanOpenaireId(p.getId()));
 		res.setOriginalId(first(p.getOriginalId()));
 		res.setTitle(structPropValue(p.getTitle()));
 		res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -106,7 +106,7 @@ public class ConversionUtils {
 		final OaBrokerMainEntity res = new OaBrokerMainEntity();
-		res.setOpenaireId(result.getId());
+		res.setOpenaireId(cleanOpenaireId(result.getId()));
 		res.setOriginalId(first(result.getOriginalId()));
 		res.setTypology(classId(result.getResulttype()));
 		res.setTitles(structPropList(result.getTitle()));
@ -129,6 +129,10 @@ public class ConversionUtils {
 		return res;
 	}
 	/**
 	 * Removes everything up to and including the first {@code '|'} from an identifier.
 	 * NOTE(review): presumably the stripped part is the entity-type prefix of an OpenAIRE id — confirm.
 	 *
 	 * @param id the identifier to clean, may be {@code null}
 	 * @return the text following the first {@code '|'}; the identifier unchanged when it contains
 	 *         no {@code '|'}; {@code null} when {@code id} is {@code null}
 	 */
 	public static String cleanOpenaireId(final String id) {
 		// A missing id must not abort the whole conversion: hand the null back to the caller.
 		if (id == null) {
 			return null;
 		}
 		final int separator = id.indexOf('|');
 		return separator >= 0 ? id.substring(separator + 1) : id;
 	}
 	private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
 		if (author == null) {
 			return null;
@ -188,7 +192,7 @@ public class ConversionUtils {
 		}
 		final OaBrokerProject res = new OaBrokerProject();
-		res.setOpenaireId(p.getId());
+		res.setOpenaireId(cleanOpenaireId(p.getId()));
 		res.setTitle(fieldValue(p.getTitle()));
 		res.setAcronym(fieldValue(p.getAcronym()));
 		res.setCode(fieldValue(p.getCode()));
@ -214,7 +218,7 @@ public class ConversionUtils {
 		}
 		final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
-		res.setOpenaireId(sw.getId());
+		res.setOpenaireId(cleanOpenaireId(sw.getId()));
 		res.setName(structPropValue(sw.getTitle()));
 		res.setDescription(fieldValue(sw.getDescription()));
 		res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
@ -230,7 +234,7 @@ public class ConversionUtils {
 		final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
 		res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
-		res.setOpenaireId(ds.getId());
+		res.setOpenaireId(cleanOpenaireId(ds.getId()));
 		res.setType(classId(ds.getDatasourcetype()));
 		return res;
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java
@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable {
 		final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
 		collectedFromSet
 			.stream()
-			.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
+			.map(
 				s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
 					BrokerConstants.COLLECTED_FROM_REL))
 			.forEach(res::addTuple);
-		hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
+
 		hostedBySet
 			.stream()
 			.map(
 				s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
 					BrokerConstants.HOSTED_BY_REL))
 			.forEach(res::addTuple);
 		return res;
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java
@ -76,6 +76,7 @@ public class EventFinder {
 		final Set<String> dsIdWhitelist,
 		final Set<String> dsIdBlacklist,
 		final Set<String> dsTypeWhitelist,
 		final Set<String> topicWhitelist,
 		final Map<String, LongAccumulator> accumulators) {
 		final List<UpdateInfo<?>> list = new ArrayList<>();
@ -84,7 +85,13 @@ public class EventFinder {
 			for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
 				if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
 					for (final UpdateMatcher<?> matcher : matchers) {
-						list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
+						for (final UpdateInfo<?> info : matcher
 							.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
 							if (topicWhitelist == null || topicWhitelist.isEmpty()
 								|| topicWhitelist.contains(info.getTopic().getPath())) {
 								list.add(info);
 							}
 						}
 					}
 				}
 			}
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/check_duplicates.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/check_duplicates.json
@ -0,0 +1,9 @@
 [
 	{
 		"paramName": "o",
 		"paramLongName": "outputDir",
 		"paramDescription": "the path where the data are stored",
 		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json
@ -7,7 +7,7 @@
 	},
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "workingDir",
 		"paramDescription": "the path where the temporary data will be stored",
 		"paramRequired": true
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
@ -6,7 +6,7 @@
            <description>the path where the graph is stored</description>
        </property>
        <property>
-            <name>workingPath</name>
+            <name>outputDir</name>
            <description>the path where the generated data will be stored</description>
        </property>
 		<property>
@ -24,6 +24,11 @@
            <value>-</value>
            <description>a black list (comma separated, - for empty list) of datasource ids</description>
        </property>
        <property>
            <name>topicWhitelist</name>
            <value>*</value>
            <description>a white list (comma separated, * for all) of topics</description>
        </property>
        <property>
            <name>esEventIndexName</name>
            <description>the elasticsearch index name for events</description>
@ -36,6 +41,26 @@
            <name>esIndexHost</name>
            <description>the elasticsearch host</description>
        </property>
        <property>
            <name>esBatchWriteRetryCount</name>
            <value>8</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esBatchWriteRetryWait</name>
            <value>60s</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esBatchSizeEntries</name>
            <value>200</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esNodesWanOnly</name>
            <value>true</value>
            <description>an ES configuration property</description>
        </property>
        <property>
        	<name>maxIndexedEventsForDsAndTopic</name>
        	<description>the max number of events for each couple (ds/topic)</description>
@ -111,15 +136,15 @@
        </configuration>
    </global>
-    <start to="ensure_working_path"/>
+    <start to="ensure_output_dir"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
-    <action name="ensure_working_path">
+    <action name="ensure_output_dir">
        <fs>
-            <mkdir path='${workingPath}'/>
+            <mkdir path='${outputDir}'/>
        </fs>
        <ok to="start_entities_and_rels"/>
        <error to="Kill"/>
@ -152,7 +177,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -176,7 +201,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -201,7 +226,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -225,7 +250,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -249,7 +274,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -273,7 +298,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
@ -299,7 +324,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step1"/>
        <error to="Kill"/>
@ -323,7 +348,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step2"/>
        <error to="Kill"/>
@ -347,7 +372,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step3"/>
        <error to="Kill"/>
@ -371,7 +396,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="join_entities_step4"/>
        <error to="Kill"/>
@ -395,7 +420,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="prepare_groups"/>
        <error to="Kill"/>
@ -419,7 +444,7 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="generate_events"/>
        <error to="Kill"/>
@ -442,10 +467,12 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
            <arg>--outputDir</arg><arg>${outputDir}</arg>
 			<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
 			<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
 			<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
 			<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
        </spark>
        <ok to="index_event_subset"/>
        <error to="Kill"/>
@ -468,9 +495,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--index</arg><arg>${esEventIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
            <arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
@ -495,9 +526,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--index</arg><arg>${esNotificationsIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="stats"/>
@ -521,7 +556,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
            <arg>--dbUser</arg><arg>${brokerDbUser}</arg>
            <arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json
@ -1,7 +1,13 @@
 [
 	{
 		"paramName": "wp",
 		"paramLongName": "workingDir",
 		"paramDescription": "the path where the temporary data are stored",
 		"paramRequired": true
 	},
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "outputDir",
 		"paramDescription": "the path where the generated events will be stored",
 		"paramRequired": true
 	},
@ -22,5 +28,11 @@
 		"paramLongName": "datasourceIdBlacklist",
 		"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
 		"paramRequired": true
 	},
 	{
 		"paramName": "topicWhitelist",
 		"paramLongName": "topicWhitelist",
 		"paramDescription": "a white list (comma separeted, * for all) of topics",
 		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json
@ -1,8 +1,8 @@
 [
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "outputDir",
-		"paramDescription": "the workinh path",
+		"paramDescription": "the data path",
 		"paramRequired": true
 	},
 	{
@ -16,5 +16,29 @@
 		"paramLongName": "esHost",
 		"paramDescription": "the ES host",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchWriteRetryCount",
 		"paramLongName": "esBatchWriteRetryCount",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchWriteRetryWait",
 		"paramLongName": "esBatchWriteRetryWait",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchSizeEntries",
 		"paramLongName": "esBatchSizeEntries",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esNodesWanOnly",
 		"paramLongName": "esNodesWanOnly",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json
@ -1,8 +1,8 @@
 [
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "outputDir",
-		"paramDescription": "the workinh path",
+		"paramDescription": "the path where the generated data are stored",
 		"paramRequired": true
 	},
 	{
@ -17,6 +17,30 @@
 		"paramDescription": "the ES host",
 		"paramRequired": true
 	},	
 	{
 		"paramName": "esBatchWriteRetryCount",
 		"paramLongName": "esBatchWriteRetryCount",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchWriteRetryWait",
 		"paramLongName": "esBatchWriteRetryWait",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchSizeEntries",
 		"paramLongName": "esBatchSizeEntries",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esNodesWanOnly",
 		"paramLongName": "esNodesWanOnly",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},	
 	{
 		"paramName": "n",
 		"paramLongName": "maxEventsForTopic",
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json
@ -1,8 +1,8 @@
 [
 	{
 		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "outputDir",
-		"paramDescription": "the workinh path",
+		"paramDescription": "the dir that contains the events folder",
 		"paramRequired": true
 	},
 	{
@ -17,6 +17,30 @@
 		"paramDescription": "the ES host",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchWriteRetryCount",
 		"paramLongName": "esBatchWriteRetryCount",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchWriteRetryWait",
 		"paramLongName": "esBatchWriteRetryWait",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esBatchSizeEntries",
 		"paramLongName": "esBatchSizeEntries",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "esNodesWanOnly",
 		"paramLongName": "esNodesWanOnly",
 		"paramDescription": "an ES configuration property",
 		"paramRequired": true
 	},
 	{
 		"paramName": "broker",
 		"paramLongName": "brokerApiBaseUrl",
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
@ -6,8 +6,8 @@
            <description>the path where the graph is stored</description>
        </property>
        <property>
-            <name>workingPath</name>
+            <name>outputDir</name>
-            <description>the path where the the generated data will be stored</description>
+            <description>the path where the generated data are stored</description>
        </property>
 		<property>
            <name>datasourceIdWhitelist</name>
@ -36,6 +36,26 @@
            <name>esIndexHost</name>
            <description>the elasticsearch host</description>
        </property>
        <property>
            <name>esBatchWriteRetryCount</name>
            <value>8</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esBatchWriteRetryWait</name>
            <value>60s</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esBatchSizeEntries</name>
            <value>200</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esNodesWanOnly</name>
            <value>true</value>
            <description>an ES configuration property</description>
        </property>
        <property>
        	<name>maxIndexedEventsForDsAndTopic</name>
        	<description>the max number of events for each couple (ds/topic)</description>
@ -122,9 +142,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--index</arg><arg>${esNotificationsIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json
@ -0,0 +1,14 @@
 [
 	{
 		"paramName": "o",
 		"paramLongName": "outputDir",
 		"paramDescription": "the path where the data will be stored",
 		"paramRequired": true
 	},
 	{
 		"paramName": "list",
 		"paramLongName": "opendoarIds",
 		"paramDescription": "the opendoar IDs whitelist (comma separated)",
 		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml
@ -0,0 +1,99 @@
 <workflow-app name="partitionEventsByOpendoarIds" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>opendoarIds</name>
            <description>the opendoar IDs whitelist (comma separated)</description>
        </property>
        <property>
            <name>outputDir</name>
            <description>the path where the generated data will be stored</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="opendoarPartition"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
   <action name="opendoarPartition">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>PartitionEventsByDsIdJob</name>
            <class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/config-default.xml
@ -0,0 +1,18 @@
 <configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml
@ -1,41 +1,38 @@
-<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
-            <name>graphInputPath</name>
+            <name>outputDir</name>
            <description>the path where the generated data are stored</description>
        </property>
        <property>
            <name>workingPath</name>
            <description>the path where the generated data will be stored</description>
        </property>
 		<property>
            <name>datasourceIdWhitelist</name>
            <value>-</value>
            <description>a white list (comma separated, - for empty list) of datasource ids</description>
        </property>
 		<property>
            <name>datasourceTypeWhitelist</name>
            <value>-</value>
            <description>a white list (comma separated, - for empty list) of datasource types</description>
        </property>
 		<property>
            <name>datasourceIdBlacklist</name>
            <value>-</value>
            <description>a black list (comma separated, - for empty list) of datasource ids</description>
        </property>
        <property>
            <name>esEventIndexName</name>
            <description>the elasticsearch index name for events</description>
        </property>
        <property>
            <name>esNotificationsIndexName</name>
            <description>the elasticsearch index name for notifications</description>
        </property>
        <property>
            <name>esIndexHost</name>
            <description>the elasticsearch host</description>
        </property>
        <property>
            <name>esBatchWriteRetryCount</name>
            <value>8</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esBatchWriteRetryWait</name>
            <value>60s</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esBatchSizeEntries</name>
            <value>200</value>
            <description>an ES configuration property</description>
        </property>
 		<property>
            <name>esNodesWanOnly</name>
            <value>true</value>
            <description>an ES configuration property</description>
        </property>
        <property>
        	<name>maxIndexedEventsForDsAndTopic</name>
        	<description>the max number of events for each couple (ds/topic)</description>
@ -44,18 +41,6 @@
        	<name>brokerApiBaseUrl</name>
        	<description>the url of the broker service api</description>
        </property>
        <property>
        	<name>brokerDbUrl</name>
        	<description>the url of the broker database</description>
        </property>
        <property>
        	<name>brokerDbUser</name>
        	<description>the user of the broker database</description>
        </property>
        <property>
        	<name>brokerDbPassword</name>
        	<description>the password of the broker database</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -111,36 +96,45 @@
        </configuration>
    </global>
-    <start to="partition"/>
+    <start to="index_event_subset"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
-   <action name="partition">
+     <action name="index_event_subset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>PartitionEventsByDsIdJob</name>
+            <name>IndexEventSubsetOnESJob</name>
-            <class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
+            <class>eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="8" 
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--outputDir</arg><arg>${outputDir}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--index</arg><arg>${esEventIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
            <arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
            <arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
            <arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
            <arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
            <arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json
@ -1,8 +1,8 @@
 [
 	{
-		"paramName": "wp",
+		"paramName": "o",
-		"paramLongName": "workingPath",
+		"paramLongName": "outputDir",
-		"paramDescription": "the working path",
+		"paramDescription": "the path where generated data are stored",
 		"paramRequired": true
 	},
 	{
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -10,10 +10,11 @@ import java.io.Serializable;
 import java.nio.file.Paths;
 import java.util.*;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.pace.util.MapDocumentUtil;
@ -100,8 +101,8 @@ public class EntityMergerTest implements Serializable {
 		assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
 		// verify authors
-		assertEquals(pub_merged.getAuthor().size(), 9);
+		assertEquals(13, pub_merged.getAuthor().size());
-		assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
+		assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
 		// verify title
 		int count = 0;
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
@ -7,7 +7,6 @@ import java.util.List;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
@ -16,8 +15,8 @@ import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.codehaus.jackson.map.ObjectMapper;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.hash.Hashing;
 import eu.dnetlib.dedup.graph.ConnectedComponent;
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
@ -10,7 +10,8 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
-import org.codehaus.jackson.map.ObjectMapper;
+
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.action.AtomicAction
 import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.commons.lang3.StringUtils
-import org.codehaus.jackson.map.ObjectMapper
+import com.fasterxml.jackson.databind.ObjectMapper
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
 import org.slf4j.{Logger, LoggerFactory}
 import scala.collection.JavaConverters._
 import scala.io.Source
 /** Plain data holder for a hosted-by item: an identifier, an official name,
   * three ISSN fields (issn, eissn, lissn) and an open-access flag.
   * NOTE(review): presumably one entry of the hosted-by map — confirm against the loader.
   */
 case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
@ -19,23 +18,18 @@ case class HostedByItemType(id: String, officialname: String, issn: String, eiss
 /** Affiliation record identified by a PaperId / AffiliationId pair, with an
   * optional GridId, OfficialPage and DisplayName (each may be absent).
   */
 case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
 object DoiBoostMappingUtil {
  /** Returns the Qualifier used when a country is not known.
    *
    * It is built by createQualifier with "UNKNOWN" as its first two arguments
    * and "dnet:countries" as its last two.
    *
    * @return the Qualifier produced by createQualifier for those four values
    */
  def getUnknownCountry(): Qualifier = {
    createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries")
  }
  /** Builds the identifier string for a MAG affiliation.
    *
    * The result is the literal prefix "20|microsoft___", followed by the value
    * of SEPARATOR and then DHPUtils.md5 applied to the given affiliation id.
    *
    * @param affId the affiliation id to be hashed
    * @return the composed identifier string
    */
  def generateMAGAffiliationId(affId: String): String = {
    s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
  }
  val logger: Logger = LoggerFactory.getLogger(getClass)
  //STATIC STRING
  val MAG = "microsoft"
  val MAG_NAME = "Microsoft Academic Graph"
-  val ORCID = "ORCID"
+  val ORCID = "orcid"
  val ORCID_PENDING = "orcid_pending"
  val CROSSREF = "Crossref"
  val UNPAYWALL = "UnpayWall"
  val GRID_AC = "grid.ac"
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
@ -39,33 +39,38 @@ object SparkGenerateDOIBoostActionSet {
    val dbaffiliationRelationPath   = parser.get("dbaffiliationRelationPath")
    val dbOrganizationPath          = parser.get("dbOrganizationPath")
    val workingDirPath              = parser.get("targetPath")
    val sequenceFilePath            = parser.get("sFilePath")
-    spark.read.load(dbDatasetPath).as[OafDataset]
+    val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
      .map(d =>DoiBoostMappingUtil.fixResult(d))
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
-    spark.read.load(dbPublicationPath).as[Publication]
+    val asPublication =spark.read.load(dbPublicationPath).as[Publication]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
-    spark.read.load(dbOrganizationPath).as[Organization]
+    val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
-    spark.read.load(crossRefRelation).as[Relation]
+    val asCRelation = spark.read.load(crossRefRelation).as[Relation]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
-    spark.read.load(dbaffiliationRelationPath).as[Relation]
+    val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
      .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
    val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
-    d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
+
    val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
 //      spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
    d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
@ -2,6 +2,7 @@ package eu.dnetlib.doiboost
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.oa.merge.AuthorMerger
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
 import eu.dnetlib.doiboost.mag.ConversionUtil
 import org.apache.commons.io.IOUtils
@ -30,7 +31,7 @@ object SparkGenerateDoiBoost {
    import spark.implicits._
    val hostedByMapPath = parser.get("hostedByMapPath")
-    val workingDirPath = parser.get("workingDirPath")
+    val workingDirPath = parser.get("workingPath")
    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
@ -62,7 +63,7 @@ object SparkGenerateDoiBoost {
    val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
    fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
-    logger.info("Phase 3) Join Result with MAG")
+    logger.info("Phase 4) Join Result with MAG")
    val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
    val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
@ -132,7 +133,7 @@ object SparkGenerateDoiBoost {
          o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
        if (affiliation.OfficialPage.isDefined)
          o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
-        o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
+        o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
        o
      }
      else
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -200,7 +200,7 @@ case object Crossref2Oaf {
    a.setSurname(family)
    a.setFullname(s"$given $family")
    if (StringUtils.isNotBlank(orcid))
-      a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava)
+      a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava)
    a
  }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java
@ -2,18 +2,16 @@
 package eu.dnetlib.doiboost.crossref;
 import java.io.ByteArrayOutputStream;
 import java.util.Optional;
 import java.util.zip.Inflater;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -30,34 +28,45 @@ public class CrossrefImporter {
 		parser.parseArgument(args);
-		final String hdfsuri = parser.get("namenode");
+		final String namenode = parser.get("namenode");
-		System.out.println("HDFS URI" + hdfsuri);
+		System.out.println("namenode: " + namenode);
 		Path hdfswritepath = new Path(parser.get("targetPath"));
 		System.out.println("TargetPath: " + hdfsuri);
-		final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
+		Path targetPath = new Path(parser.get("targetPath"));
-			? Long.parseLong(parser.get("timestamp"))
+		System.out.println("targetPath: " + targetPath);
 			: -1;
-		if (timestamp > 0)
+		final Long timestamp = Optional
-			System.out.println("Timestamp added " + timestamp);
+			.ofNullable(parser.get("timestamp"))
 			.map(s -> {
 				try {
 					return Long.parseLong(s);
 				} catch (NumberFormatException e) {
 					return -1L;
 				}
 			})
 			.orElse(-1L);
 		System.out.println("timestamp: " + timestamp);
 		final String esServer = parser.get("esServer");
 		System.out.println("esServer: " + esServer);
 		final String esIndex = parser.get("esIndex");
 		System.out.println("esIndex: " + esIndex);
 		// ====== Init HDFS File System Object
 		Configuration conf = new Configuration();
 		// Set FileSystem URI
-		conf.set("fs.defaultFS", hdfsuri);
+		conf.set("fs.defaultFS", namenode);
 		// Because of Maven
 		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
 		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-		ESClient client = timestamp > 0
+		// "ip-90-147-167-25.ct1.garrservices.it", "crossref"
-			? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
+		final ESClient client = new ESClient(esServer, esIndex, timestamp);
 			: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
 		try (SequenceFile.Writer writer = SequenceFile
 			.createWriter(
 				conf,
-				SequenceFile.Writer.file(hdfswritepath),
+				SequenceFile.Writer.file(targetPath),
 				SequenceFile.Writer.keyClass(IntWritable.class),
 				SequenceFile.Writer.valueClass(Text.class))) {
@ -74,8 +83,7 @@ public class CrossrefImporter {
 					end = System.currentTimeMillis();
 					final float time = (end - start) / 1000.0F;
 					System.out
-						.println(
+						.println(String.format("Imported %s records last 100000 imported in %s seconds", i, time));
 							String.format("Imported %d records last 100000 imported in %f seconds", i, time));
 					start = System.currentTimeMillis();
 				}
 			}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java
@ -1,11 +1,11 @@
 package eu.dnetlib.doiboost.crossref;
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.List;
 import org.apache.commons.io.IOUtils;
 import org.apache.http.HttpHeaders;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpPost;
 import org.apache.http.entity.StringEntity;
@ -17,13 +17,17 @@ import org.slf4j.LoggerFactory;
 import com.jayway.jsonpath.JsonPath;
 public class ESClient implements Iterator<String> {
 	private static final Logger logger = LoggerFactory.getLogger(ESClient.class);
-	static final String blobPath = "$.hits[*].hits[*]._source.blob";
+	private static final String BLOB_PATH = "$.hits.hits[*]._source.blob";
-	static final String scrollIdPath = "$._scroll_id";
+	private static final String SCROLL_ID_PATH = "$._scroll_id";
-	static final String JSON_NO_TS = "{\"size\":1000}";
+	private static final String JSON_NO_TS = "{\"size\":1000}";
-	static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
+	private static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
-	static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
+	private static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
 	public static final String APPLICATION_JSON = "application/json";
 	public static final String ES_SEARCH_URL = "http://%s:9200/%s/_search?scroll=1m";
 	public static final String ES_SCROLL_URL = "http://%s:9200/_search/scroll";
 	private final String scrollId;
@ -31,47 +35,30 @@ public class ESClient implements Iterator<String> {
 	private final String esHost;
-	public ESClient(final String esHost, final String esIndex) throws IOException {
+	public ESClient(final String esHost, final String esIndex, final long timestamp) {
 		this.esHost = esHost;
 		final String body = getResponse(
 			String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
 		scrollId = getJPathString(scrollIdPath, body);
 		buffer = getBlobs(body);
 	}
-	public ESClient(final String esHost, final String esIndex, final long timestamp)
+		final String body = timestamp > 0
-		throws IOException {
+			? getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), String.format(JSON_WITH_TS, timestamp))
-		this.esHost = esHost;
+			: getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), JSON_NO_TS);
-		final String body = getResponse(
+		scrollId = getJPathString(SCROLL_ID_PATH, body);
 			String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
 			String.format(JSON_WITH_TS, timestamp));
 		scrollId = getJPathString(scrollIdPath, body);
 		buffer = getBlobs(body);
 	}
 	private String getResponse(final String url, final String json) {
-		CloseableHttpClient client = HttpClients.createDefault();
+		try (CloseableHttpClient client = HttpClients.createDefault()) {
 		try {
 			HttpPost httpPost = new HttpPost(url);
 			if (json != null) {
 				StringEntity entity = new StringEntity(json);
 				httpPost.setEntity(entity);
-				httpPost.setHeader("Accept", "application/json");
+				httpPost.setHeader(HttpHeaders.ACCEPT, APPLICATION_JSON);
-				httpPost.setHeader("Content-type", "application/json");
+				httpPost.setHeader(HttpHeaders.CONTENT_TYPE, APPLICATION_JSON);
 			}
-			CloseableHttpResponse response = client.execute(httpPost);
+			try (CloseableHttpResponse response = client.execute(httpPost)) {
 				return IOUtils.toString(response.getEntity().getContent());
 			}
 		} catch (Throwable e) {
 			throw new RuntimeException("Error on executing request ", e);
 		} finally {
 			try {
 				client.close();
 			} catch (IOException e) {
 				throw new RuntimeException("Unable to close client ", e);
 			}
 		}
 	}
@ -87,7 +74,7 @@ public class ESClient implements Iterator<String> {
 	}
 	private List<String> getBlobs(final String body) {
-		final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
+		final List<String> res = JsonPath.read(body, BLOB_PATH);
 		return res;
 	}
@ -102,11 +89,11 @@ public class ESClient implements Iterator<String> {
 		if (buffer.isEmpty()) {
 			final String json_param = String.format(JSON_SCROLL, scrollId);
-			final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
+			final String body = getResponse(String.format(ES_SCROLL_URL, esHost), json_param);
 			try {
 				buffer = getBlobs(body);
 			} catch (Throwable e) {
-				logger.error("Error on  get next page: body:" + body);
+				System.out.println("Error on  get next page: body:" + body);
 			}
 		}
 		return nextItem;
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
@ -21,15 +21,17 @@ object SparkImportMagIntoDataset {
  val stream = Map(
-    "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
+    "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
-    "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
+    "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
-    "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
+    "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
    "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
    "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
    "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
-    "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
+    //                                                         ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime']
-    "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
+    "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
    "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
    "PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
    "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
@ -39,7 +41,7 @@ object SparkImportMagIntoDataset {
    "PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
    "PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
    "PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
-    "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
+    "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
    "RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
  )
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala
@ -11,7 +11,7 @@ import org.slf4j.{Logger, LoggerFactory}
 import scala.collection.JavaConverters._
-object SparkPreProcessMAG {
+object SparkProcessMAG {
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
@ -26,12 +26,15 @@ object SparkPreProcessMAG {
        .master(parser.get("master")).getOrCreate()
    val sourcePath = parser.get("sourcePath")
    val workingPath = parser.get("workingPath")
    val targetPath = parser.get("targetPath")
    import spark.implicits._
    implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
    logger.info("Phase 1) make uninque DOI in Papers:")
-    val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
+    val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
    // Keep only the Papers that have a DOI; since the same DOI can occur in multiple versions of an item with different PaperIds, we take the last one
    val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
@ -41,11 +44,12 @@ object SparkPreProcessMAG {
      .map(_._2)
    val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
-    distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
+
    distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")
    logger.info("Phase 0) Enrich Publication with description")
-    val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
+    val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
-    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
+    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
    logger.info("Phase 3) Group Author by PaperId")
    val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
@ -64,24 +68,24 @@ object SparkPreProcessMAG {
        } else
          mpa
      }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
-      .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
    logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
    val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
-    val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
+    val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
-    val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
+    val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
    val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
    firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
      .map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
-      .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
    var magPubs: Dataset[(String, Publication)] =
-      spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
+      spark.read.load(s"$workingPath/merge_step_2").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
@ -95,10 +99,10 @@ object SparkPreProcessMAG {
      .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
      .write
      .mode(SaveMode.Overwrite)
-      .save(s"${parser.get("targetPath")}/merge_step_2_conference")
+      .save(s"$workingPath/merge_step_2_conference")
-    magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
+    magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
    val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
@ -108,27 +112,27 @@ object SparkPreProcessMAG {
    magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
      .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
      .write.mode(SaveMode.Overwrite)
-      .save(s"${parser.get("targetPath")}/merge_step_3")
+      .save(s"$workingPath/merge_step_3")
 //    logger.info("Phase 6) Enrich Publication with description")
 //    val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
 //    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
-    val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
+    val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
-    magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
+    magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
    magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithDescription(item)
-    ).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
+    ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
    logger.info("Phase 7) Enrich Publication with FieldOfStudy")
-    magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
+    magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
    val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
@ -144,14 +148,14 @@ object SparkPreProcessMAG {
      .equalTo(paperField("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithSubject(item))
      .write.mode(SaveMode.Overwrite)
-      .save(s"${parser.get("targetPath")}/mag_publication")
+      .save(s"$workingPath/mag_publication")
-    val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
+    val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
      .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
    .map(_._2)
-    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
+    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
  }
 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
@ -1,10 +1,11 @@
 package eu.dnetlib.doiboost.orcid
-import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
+import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
 import eu.dnetlib.dhp.schema.orcid.OrcidDOI
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
 import org.apache.commons.lang.StringUtils
 import org.codehaus.jackson.map.ObjectMapper
 import org.slf4j.{Logger, LoggerFactory}
 import scala.collection.JavaConverters._
@ -17,7 +18,7 @@ case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,err
 case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
 object ORCIDToOAF {
  val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
-  val mapper = new ObjectMapper
+  val mapper = new ObjectMapper()
  def isJsonValid(inputStr: String): Boolean = {
    import java.io.IOException
@ -43,16 +44,19 @@ object ORCIDToOAF {
  }
-  def convertTOOAF(input:ORCIDElement) :Publication = {
+  def convertTOOAF(input:OrcidDOI) :Publication = {
-    val doi = input.doi
+    val doi = input.getDoi
    val pub:Publication = new Publication
-    pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
+    pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
    pub.setDataInfo(generateDataInfo())
    pub.setId(generateIdentifier(pub, doi.toLowerCase))
    try{
-      pub.setAuthor(input.authors.map(a=> {
+
-        generateAuthor(a.name, a.surname, a.creditName, a.oid)
+      val l:List[Author]= input.getAuthors.asScala.map(a=> {
-      }).asJava)
+              generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
            })(collection.breakOut)
      pub.setAuthor(l.asJava)
      pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
      pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
      pub
@ -63,6 +67,13 @@ object ORCIDToOAF {
    }
  }
  /**
   * Builds the DataInfo attached to the ORCID identifier of an author.
   *
   * The DataInfo is obtained from DoiBoostMappingUtil.generateDataInfo("0.91")
   * (presumably "0.91" is the trust value -- confirm against DoiBoostMappingUtil)
   * and its provenance action is then set to
   * "sysimport:crosswalk:entityregistry" / "Harvested".
   *
   * @return the DataInfo to set on ORCID author pids
   */
  def generateOricPIDDatainfo():DataInfo = {
    val pidDataInfo = DoiBoostMappingUtil.generateDataInfo("0.91")
    val provenance = pidDataInfo.getProvenanceaction
    provenance.setClassid("sysimport:crosswalk:entityregistry")
    provenance.setClassname("Harvested")
    pidDataInfo
  }
  def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
    val a = new Author
    a.setName(given)
@ -72,7 +83,7 @@ object ORCIDToOAF {
    else
      a.setFullname(s"$given $family")
    if (StringUtils.isNotBlank(orcid))
-      a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
+      a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateOricPIDDatainfo())).asJava)
    a
  }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
@ -1,21 +1,72 @@
 package eu.dnetlib.doiboost.orcid
 import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.oa.merge.AuthorMerger
 import eu.dnetlib.dhp.schema.oaf.Publication
 import eu.dnetlib.dhp.schema.orcid.OrcidDOI
 import eu.dnetlib.doiboost.mag.ConversionUtil
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 object SparkConvertORCIDToOAF {
  val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
  /**
   * Creates the typed aggregator used to collapse all the publications that
   * share the same key into a single Publication.
   *
   * Each incoming publication is merged into the buffer with mergeFrom, the
   * author lists are combined through AuthorMerger.mergeAuthor, and the buffer
   * takes the identifier of the first publication it sees when it has none.
   *
   * @return an Aggregator from (key, Publication) pairs to the merged Publication
   */
  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{

    // The empty buffer: a Publication with no identifier and no content.
    override def zero: Publication = new Publication()

    // Folds one (key, publication) pair into the buffer.
    override def reduce(b: Publication, a: (String, Publication)): Publication = {
      b.mergeFrom(a._2)
      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    // Combines two partial buffers.
    override def merge(wx: Publication, wy: Publication): Publication = {
      wx.mergeFrom(wy)
      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
      // wy may itself be a buffer without identifier: check for null before
      // calling nonEmpty, otherwise merging two id-less buffers throws a
      // NullPointerException.
      if (wx.getId == null && wy.getId != null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: Publication): Publication = reduction

    override def bufferEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])

    override def outputEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])
  }
 /**
  * Converts the ORCID dump into OAF publications.
  *
  * Every line of the text file at sourcePath is parsed as an OrcidDOI JSON
  * record and mapped to a Publication; null results are dropped, publications
  * are grouped by identifier and merged with getPublicationAggregator, and the
  * result is saved at targetPath.
  *
  * @param spark      the active Spark session
  * @param sourcePath path of the text file(s) holding one OrcidDOI JSON per line
  * @param targetPath output path; it is written with SaveMode.Overwrite
  */
 def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
  implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
  implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
  implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)

  val mapper = new ObjectMapper()
  // The previous call, getDeserializationConfig.withFeatures(...), returned a
  // new configuration object that was discarded, so it never touched the mapper.
  // Configure the mapper itself; "true" keeps the strict behaviour that was
  // effectively in place.
  // NOTE(review): if unknown JSON properties were meant to be tolerated, this
  // flag has to be false -- confirm the intent.
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true)
  val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))

  logger.info("Converting ORCID to OAF")
  dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
    .map(d => (d.getId, d))
    .groupByKey(_._1)(Encoders.STRING)
    .agg(getPublicationAggregator().toColumn)
    .map(p => p._2)
    .write.mode(SaveMode.Overwrite).save(targetPath)
 }
  def main(args: Array[String]): Unit = {
-    val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
+
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
    parser.parseArgument(args)
@ -26,19 +77,12 @@ object SparkConvertORCIDToOAF {
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
-    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
+
-    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
+
    import spark.implicits._
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
-    val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
+    run(spark, sourcePath, targetPath)
    logger.info("Converting ORCID to OAF")
    val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
      .map(_._2)
    spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
  }
 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java
@ -3,10 +3,8 @@ package eu.dnetlib.doiboost.orcid;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.List;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
@ -18,11 +16,9 @@ import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.util.LongAccumulator;
 import org.mortbay.log.Log;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -37,7 +33,7 @@ public class SparkDownloadOrcidAuthors {
 	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
 	static String lastUpdate;
-	public static void main(String[] args) throws IOException, Exception {
+	public static void main(String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
@ -52,12 +48,12 @@ public class SparkDownloadOrcidAuthors {
 			.orElse(Boolean.TRUE);
 		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String workingPath = parser.get("workingPath");
-		logger.info("workingPath: ", workingPath);
+		logger.info("workingPath: {}", workingPath);
 		final String outputPath = parser.get("outputPath");
-		logger.info("outputPath: ", outputPath);
+		logger.info("outputPath: {}", outputPath);
 		final String token = parser.get("token");
 		final String lambdaFileName = parser.get("lambdaFileName");
-		logger.info("lambdaFileName: ", lambdaFileName);
+		logger.info("lambdaFileName: {}", lambdaFileName);
 		lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
@ -179,8 +175,8 @@ public class SparkDownloadOrcidAuthors {
 	}
 	private static boolean isModified(String orcidId, String modifiedDate) {
-		Date modifiedDateDt = null;
+		Date modifiedDateDt;
-		Date lastUpdateDt = null;
+		Date lastUpdateDt;
 		try {
 			if (modifiedDate.length() != 19) {
 				modifiedDate = modifiedDate.substring(0, 19);
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json
@ -5,5 +5,6 @@
  {"paramName": "cr",   "paramLongName":"crossRefRelation",                 "paramDescription": "the Crossref Relation Path",  "paramRequired": true},
  {"paramName": "da",   "paramLongName":"dbaffiliationRelationPath",        "paramDescription": "the DOIBoost Affiliation Relation Path",  "paramRequired": true},
  {"paramName": "do",   "paramLongName":"dbOrganizationPath",               "paramDescription": "the DOIBoost Organization Path",  "paramRequired": true},
-  {"paramName": "w",    "paramLongName":"targetPath",                       "paramDescription": "the Working Path",                "paramRequired": true}
+  {"paramName": "w",    "paramLongName":"targetPath",                       "paramDescription": "the Working Path",                "paramRequired": true},
  {"paramName": "sp",    "paramLongName":"sFilePath",                       "paramDescription": "the Sequence file Path",          "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json
@ -3,5 +3,5 @@
  {"paramName": "hb",   "paramLongName":"hostedByMapPath",            "paramDescription": "the hosted By Map Path",         "paramRequired": true},
  {"paramName": "ap",   "paramLongName":"affiliationPath",            "paramDescription": "the Affiliation Path",           "paramRequired": true},
  {"paramName": "pa",   "paramLongName":"paperAffiliationPath",      "paramDescription": "the paperAffiliation Path",       "paramRequired": true},
-  {"paramName": "w",    "paramLongName":"workingDirPath",            "paramDescription": "the Working Path",                "paramRequired": true}
+  {"paramName": "w",    "paramLongName":"workingPath",                "paramDescription": "the Working Path",                "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json
@ -1,5 +1,7 @@
 [
  {"paramName":"t",   "paramLongName":"targetPath",    "paramDescription": "the path of the sequential file to write",   "paramRequired": true},
  {"paramName":"n",   "paramLongName":"namenode",      "paramDescription": "the HDFS namenode URI",                      "paramRequired": true},
-  {"paramName":"ts",   "paramLongName":"timestamp",         "paramDescription": "timestamp",                                  "paramRequired": false}
+  {"paramName":"ts",  "paramLongName":"timestamp",     "paramDescription": "timestamp",                                  "paramRequired": false},
  {"paramName":"ess", "paramLongName":"esServer",     "paramDescription": "elasticsearch server url",                   "paramRequired": true},
  {"paramName":"esi", "paramLongName":"esIndex",      "paramDescription": "elasticsearch index name",                   "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml
@ -39,14 +39,7 @@
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
-    <action name="ResetWorkingPath">
+
        <fs>
            <delete path='${workingDirPath}'/>
            <mkdir path='${workingDirPath}'/>
        </fs>
        <ok to="CreateDOIBoost"/>
        <error to="Kill"/>
    </action>
    <action name="CreateDOIBoost">
        <spark xmlns="uri:oozie:spark-action:0.2">
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml
@ -8,6 +8,10 @@
            <name>targetPath</name>
            <description>the working dir base path</description>
        </property>
        <property>
            <name>workingPath</name>
            <description>the working dir base path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -31,10 +35,10 @@
    <action name="ResetWorkingPath">
        <fs>
-            <delete path='${targetPath}'/>
+            <delete path='${workingPath}'/>
-            <mkdir path='${targetPath}'/>
+            <mkdir path='${workingPath}'/>
        </fs>
-        <ok to="PreprocessMag"/>
+        <ok to="ConvertMagToDataset"/>
        <error to="Kill"/>
    </action>
@ -52,10 +56,10 @@
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
-        <ok to="End"/>
+        <ok to="PreprocessMag"/>
        <error to="Kill"/>
    </action>
@ -65,7 +69,7 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
-            <name>Convert Mag to Dataset</name>
+            <name>Convert Mag to OAF Dataset</name>
            <class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
@ -75,7 +79,8 @@
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${workingPath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}/process</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json
@ -1,6 +1,7 @@
 [
  {"paramName":"s",   "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input",  "paramRequired": true},
-  {"paramName":"t",   "paramLongName":"targetPath", "paramDescription": "the working dir path",                      "paramRequired": true},
+  {"paramName":"t",   "paramLongName":"targetPath", "paramDescription": "the target dir path",                      "paramRequired": true},
  {"paramName":"w",   "paramLongName":"workingPath", "paramDescription": "the working dir path",                      "paramRequired": true},
  {"paramName":"m",   "paramLongName":"master",     "paramDescription": "the master name",                          "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
@ -0,0 +1,42 @@
 <configuration>
    <!-- Default properties for the DOIBoost Oozie application. -->
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <!-- NOTE(review): the workflow.xml of this application reads the share lib name
         from a property called oozieActionShareLibForSpark2, which is not defined
         in this file; confirm it is supplied when the job is submitted. -->
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
    <!-- NOTE(review): the two host names below look specific to one (test) cluster;
         confirm they are overridden for other environments. -->
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
@ -0,0 +1,335 @@
 <workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorIntersectionMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <!-- Intersection Parameters -->
        <property>
            <name>workingPath</name>
            <description>the working Path</description>
        </property>
        <property>
            <name>hostedByMapPath</name>
            <description>the hostedByMap Path</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the Path of the sequence file action set</description>
        </property>
        <!-- Crossref Parameters -->
        <property>
            <name>inputPathCrossref</name>
            <description>the Crossref input path</description>
        </property>
        <property>
            <name>crossrefTimestamp</name>
            <description>Timestamp for the Crossref incremental Harvesting</description>
        </property>
        <property>
            <name>esServer</name>
            <description>elasticsearch server url for the Crossref Harvesting</description>
        </property>
        <property>
            <name>esIndex</name>
            <description>elasticsearch index name for the Crossref Harvesting</description>
        </property>
        <!--    MAG Parameters    -->
        <property>
            <name>inputPathMAG</name>
            <description>the MAG working path</description>
        </property>
        <!--    UnpayWall Parameters    -->
        <property>
            <name>inputPathUnpayWall</name>
            <description>the UnpayWall working path</description>
        </property>
        <!--    ORCID Parameters    -->
        <property>
            <name>inputPathOrcid</name>
            <description>the ORCID working path</description>
        </property>
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="resume_from"/>
    <!-- Entry point of the workflow: jumps to the action selected by the 'resumeFrom'
         job property. When the property matches none of the cases the whole pipeline
         runs, starting from ImportCrossRef.
         The resume keys of the MAG, UnpayWall and ORCID steps ('PreprocessMag',
         'PreprocessUW', 'PreprocessORCID') are not the names of the actions they
         start (ProcessMAG, ProcessUW, ProcessORCID). -->
    <decision name="resume_from">
        <switch>
            <case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
            <case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
            <case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
            <case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
            <case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
            <case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
            <case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
            <default to="ImportCrossRef"/>
        </switch>
    </decision>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="ImportCrossRef">
        <java>
            <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
            <arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
            <arg>--namenode</arg><arg>${nameNode}</arg>
            <arg>--esServer</arg><arg>${esServer}</arg>
            <arg>--esIndex</arg><arg>${esIndex}</arg>
            <arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
        </java>
        <ok to="GenerateCrossrefDataset"/>
        <error to="Kill"/>
    </action>
    <!-- CROSSREF SECTION -->
    <action name="GenerateCrossrefDataset">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn-cluster</master>
                <mode>cluster</mode>
                <name>GenerateCrossrefDataset</name>
                <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
                <jar>dhp-doiboost-${projectVersion}.jar</jar>
                <spark-opts>
                    --executor-memory=${sparkExecutorMemory}
                    --executor-cores=${sparkExecutorCores}
                    --driver-memory=${sparkDriverMemory}
                    --conf spark.sql.shuffle.partitions=3840
                    ${sparkExtraOPT}
                </spark-opts>
                <arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
                <arg>--master</arg><arg>yarn-cluster</arg>
            </spark>
            <ok to="RenameDataset"/>
            <error to="Kill"/>
    </action>
    <action name="RenameDataset">
        <fs>
            <delete path="${inputPathCrossref}/crossref_ds"/>
            <move source="${inputPathCrossref}/crossref_ds_updated"
                  target="${inputPathCrossref}/crossref_ds"/>
        </fs>
        <ok to="ConvertCrossrefToOAF"/>
        <error to="Kill"/>
    </action>
    <action name="ConvertCrossrefToOAF">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>ConvertCrossrefToOAF</name>
            <class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="ResetMagWorkingPath"/>
        <error to="Kill"/>
    </action>
    <!-- MAG SECTION -->
    <action name="ResetMagWorkingPath">
        <fs>
            <!-- Remove the folders rewritten by the MAG steps: 'dataset' is the target
                 of ConvertMagToDataset, 'process' is the working path of ProcessMAG.
                 Each folder is deleted once; the original listed 'dataset' twice. -->
            <delete path="${inputPathMAG}/dataset"/>
            <delete path="${inputPathMAG}/process"/>
        </fs>
        <ok to="ConvertMagToDataset"/>
        <error to="Kill"/>
    </action>
    <action name="ConvertMagToDataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert Mag to Dataset</name>
            <class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathMAG}/input</arg>
            <arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="ProcessMAG"/>
        <error to="Kill"/>
    </action>
    <action name="ProcessMAG">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert Mag to OAF Dataset</name>
            <class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
            <arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="ProcessUW"/>
        <error to="Kill"/>
    </action>
    <!--  UnpayWall  SECTION -->
    <action name="ProcessUW">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert UnpayWall to Dataset</name>
            <class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="ProcessORCID"/>
        <error to="Kill"/>
    </action>
    <!--  ORCID  SECTION -->
    <!-- Runs SparkConvertORCIDToOAF on the ORCID input and then moves on to CreateDOIBoost.
         NOTE(review): SparkConvertORCIDToOAF.run saves its result with SaveMode.Overwrite
         directly at the targetPath argument, and the value passed here is the shared
         working path that ConvertCrossrefToOAF, ProcessMAG and ProcessUW also receive as
         their targetPath. Confirm that this does not replace the output of the earlier
         steps; a dedicated sub-folder may have been intended. -->
    <action name="ProcessORCID">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert ORCID to Dataset</name>
            <class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="CreateDOIBoost"/>
        <error to="Kill"/>
    </action>
    <!-- INTERSECTION SECTION-->
    <!-- Runs SparkGenerateDoiBoost over the working path and then moves on to
         GenerateActionSet. It uses its own executor memory setting
         (sparkExecutorIntersectionMemory).
         NOTE(review): affiliationPath and paperAffiliationPath point under the MAG
         'process' folder, whereas ConvertMagToDataset writes its output to the MAG
         'dataset' folder and ProcessMAG reads its input from there. Confirm that
         Affiliations and PaperAuthorAffiliations really exist under 'process'. -->
    <action name="CreateDOIBoost">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create DOIBoost Infospace</name>
            <class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorIntersectionMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
            <arg>--affiliationPath</arg><arg>${inputPathMAG}/process/Affiliations</arg>
            <arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/process/PaperAuthorAffiliations</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="GenerateActionSet"/>
        <error to="Kill"/>
    </action>
    <action name="GenerateActionSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Generate DOIBoost ActionSet</name>
            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
            <arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
            <arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
            <arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
            <arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
            <arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
            <arg>--sFilePath</arg><arg>${outputPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala
@ -1,6 +1,9 @@
 package eu.dnetlib.doiboost.orcid
-import org.codehaus.jackson.map.ObjectMapper
+import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.Publication
 import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test
 import org.slf4j.{Logger, LoggerFactory}
@ -21,6 +24,30 @@ class MappingORCIDToOAFTest {
    })
  }
 //  @Test
 //  def testOAFConvert():Unit ={
 //
 //    val spark: SparkSession =
 //      SparkSession
 //        .builder()
 //        .appName(getClass.getSimpleName)
 //        .master("local[*]").getOrCreate()
 //
 //
 //    SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
 //    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
 //
 //    val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
 //    println(df.first.getId)
 //    println(mapper.writeValueAsString(df.first()))
 //
 //
 //
 //
 //  }
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
@ -104,7 +104,7 @@ public class PrepareResultOrcidAssociationStep1 {
 			+ "               LATERAL VIEW EXPLODE (author) a AS MyT "
 			+ "               LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
 			+ "               WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
-			+ "                       lower(MyP.qalifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
+			+ "                       lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
 			+ "               GROUP BY id) r_t "
 			+ " JOIN ("
 			+ "        SELECT source, target "
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java
@ -108,7 +108,7 @@ public class SparkResultToCommunityFromOrganizationJob {
 					.stream()
 					.map(con -> con.getId())
 					.collect(Collectors.toList());
-				Result res = new Result();
+				R res = (R) ret.getClass().newInstance();
 				res.setId(ret.getId());
 				List<Context> propagatedContexts = new ArrayList<>();
 				for (String cId : communitySet) {
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java
@ -130,7 +130,7 @@ public class SparkResultToCommunityThroughSemRelJob {
 						})
 					.filter(Objects::nonNull)
 					.collect(Collectors.toList());
-				Result r = new Result();
+				R r = (R) ret.getClass().newInstance();
 				r.setId(ret.getId());
 				r.setContext(contextList);
 				ret.mergeFrom(r);
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java
@ -170,6 +170,7 @@ public class OrcidPropagationJobTest {
 					.filter(
 						"id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' "
 							+ "and name = 'Vajinder' and surname = 'Kumar' and pidType = '" +
 							ModelConstants.ORCID_PENDING + "'")
 					.count());
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java
@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
 public class ResultToCommunityJobTest {
@ -66,7 +65,7 @@ public class ResultToCommunityJobTest {
 	}
 	@Test
-	public void test1() throws Exception {
+	public void testSparkResultToCommunityThroughSemRelJob() throws Exception {
 		SparkResultToCommunityThroughSemRelJob
 			.main(
 				new String[] {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java
@ -15,8 +15,11 @@ import eu.dnetlib.dhp.schema.oaf.*;
 public class CleaningFunctions {
-	public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
+	public static final String DOI_PREFIX_REGEX = "^10\\.";
-	public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
+
 	public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
 	public static final int ORCID_LEN = 19;
 	public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
 	public static final Set<String> PID_BLACKLIST = new HashSet<>();
@ -56,11 +59,17 @@ public class CleaningFunctions {
 				}
 			}
 			if (Objects.nonNull(r.getAuthor())) {
-				r.getAuthor().forEach(a -> {
+				r
 					.getAuthor()
 					.stream()
 					.filter(Objects::nonNull)
 					.forEach(a -> {
 						if (Objects.nonNull(a.getPid())) {
-						a.getPid().forEach(p -> {
+							a
-							fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
+								.getPid()
-						});
+								.stream()
 								.filter(Objects::nonNull)
 								.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES));
 						}
 					});
 			}
@ -86,7 +95,7 @@ public class CleaningFunctions {
 		} else if (value instanceof Organization) {
 			Organization o = (Organization) value;
 			if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
-				o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
+				o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
 			}
 		} else if (value instanceof Relation) {
 			// nothing to clean here
@ -101,6 +110,16 @@ public class CleaningFunctions {
 					.setLanguage(
 						qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
 			}
 			if (Objects.nonNull(r.getCountry())) {
 				r
 					.setCountry(
 						r
 							.getCountry()
 							.stream()
 							.filter(Objects::nonNull)
 							.filter(c -> StringUtils.isNotBlank(c.getClassid()))
 							.collect(Collectors.toList()));
 			}
 			if (Objects.nonNull(r.getSubject())) {
 				r
 					.setSubject(
@ -153,12 +172,14 @@ public class CleaningFunctions {
 			if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
 				r
 					.setResourcetype(
-						qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
+						qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
 			}
 			if (Objects.nonNull(r.getInstance())) {
 				for (Instance i : r.getInstance()) {
 					if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
-						i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
+						i
 							.setAccessright(
 								qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
 					}
 					if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
 						i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
@ -173,12 +194,22 @@ public class CleaningFunctions {
 				if (Objects.isNull(bestaccessrights)) {
 					r
 						.setBestaccessright(
-							qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
+							qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
 				} else {
 					r.setBestaccessright(bestaccessrights);
 				}
 			}
 			if (Objects.nonNull(r.getAuthor())) {
 				r
 					.setAuthor(
 						r
 							.getAuthor()
 							.stream()
 							.filter(a -> Objects.nonNull(a))
 							.filter(a -> StringUtils.isNotBlank(a.getFullname()))
 							.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
 							.collect(Collectors.toList()));
 				boolean nullRank = r
 					.getAuthor()
 					.stream()
@ -199,6 +230,7 @@ public class CleaningFunctions {
 								a
 									.getPid()
 									.stream()
 									.filter(Objects::nonNull)
 									.filter(p -> Objects.nonNull(p.getQualifier()))
 									.filter(p -> StringUtils.isNotBlank(p.getValue()))
 									.map(p -> {
@ -211,14 +243,31 @@ public class CleaningFunctions {
 													.map(Qualifier::getClassid)
 													.orElse(""))
 											.orElse("");
-										if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
+										if (p
 											.getQualifier()
 											.getClassid()
 											.toLowerCase()
 											.contains(ModelConstants.ORCID)) {
 											if (pidProvenance
 												.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
 												p.getQualifier().setClassid(ModelConstants.ORCID);
 											} else {
 												p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
 											}
-										p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
+											final String orcid = p
 												.getValue()
 												.trim()
 												.toLowerCase()
 												.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
 											if (orcid.length() == ORCID_LEN) {
 												p.setValue(orcid);
 											} else {
 												p.setValue("");
 											}
 										}
 										return p;
 									})
 									.filter(p -> StringUtils.isNotBlank(p.getValue()))
 									.collect(
 										Collectors
 											.toMap(
@ -286,7 +335,7 @@ public class CleaningFunctions {
 			// TODO add cleaning for more PID types as needed
 			case "doi":
-				pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
+				pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
 				break;
 		}
 		return pid;
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GroupEntitiesAndRelationsSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GroupEntitiesAndRelationsSparkJob.java
@ -21,6 +21,7 @@ import org.apache.spark.sql.expressions.Aggregator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.jayway.jsonpath.Configuration;
 import com.jayway.jsonpath.DocumentContext;
@ -44,7 +45,8 @@ public class GroupEntitiesAndRelationsSparkJob {
 	private final static String SOURCE_JPATH = "$.source";
-	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
 		.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
 	public static void main(String[] args) throws Exception {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Constants.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Constants.java
@ -40,4 +40,18 @@ public class Constants {
 		coarCodeLabelMap.put("c_14cb", "CLOSED");
 		coarCodeLabelMap.put("c_f1cf", "EMBARGO");
 	}
 	public enum DUMPTYPE {
 		COMPLETE("complete"), COMMUNITY("community"), FUNDER("funder");
 		private String type;
 		DUMPTYPE(String type) {
 			this.type = type;
 		}
 		public String getType() {
 			return type;
 		}
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java
@ -11,17 +11,12 @@ import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
 import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
 import eu.dnetlib.dhp.schema.oaf.*;
 /**
@ -33,7 +28,7 @@ public class DumpProducts implements Serializable {
 	public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath,
 		Class<? extends OafEntity> inputClazz,
 		Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> outputClazz,
-		boolean graph) {
+		String dumpType) {
 		SparkConf conf = new SparkConf();
@ -42,7 +37,7 @@ public class DumpProducts implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
 				Utils.removeOutputDir(spark, outputPath);
-				execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, graph);
+				execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, dumpType);
 			});
 	}
@ -53,13 +48,13 @@ public class DumpProducts implements Serializable {
 		String communityMapPath,
 		Class<I> inputClazz,
 		Class<O> outputClazz,
-		boolean graph) {
+		String dumpType) {
 		CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
 		Utils
 			.readPath(spark, inputPath, inputClazz)
-			.map((MapFunction<I, O>) value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz))
+			.map((MapFunction<I, O>) value -> execMap(value, communityMap, dumpType), Encoders.bean(outputClazz))
 			.filter(Objects::nonNull)
 			.write()
 			.mode(SaveMode.Overwrite)
@ -70,18 +65,18 @@ public class DumpProducts implements Serializable {
 	private static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O execMap(I value,
 		CommunityMap communityMap,
-		boolean graph) {
+		String dumpType) {
 		Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
 		if (odInfo.isPresent()) {
-			if (odInfo.get().getDeletedbyinference()) {
+			if (odInfo.get().getDeletedbyinference() || odInfo.get().getInvisible()) {
 				return null;
 			}
 		} else {
 			return null;
 		}
-		if (!graph) {
+		if (Constants.DUMPTYPE.COMMUNITY.getType().equals(dumpType)) {
 			Set<String> communities = communityMap.keySet();
 			Optional<List<Context>> inputContext = Optional
@ -102,7 +97,8 @@ public class DumpProducts implements Serializable {
 				return null;
 			}
 		}
-		return (O) ResultMapper.map(value, communityMap, graph);
+
 		return (O) ResultMapper.map(value, communityMap, dumpType);
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java
@ -21,10 +21,10 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 public class ResultMapper implements Serializable {
 	public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
-		E in, Map<String, String> communityMap, boolean graph) {
+		E in, Map<String, String> communityMap, String dumpType) {
 		Result out;
-		if (graph) {
+		if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
 			out = new GraphResult();
 		} else {
 			out = new CommunityResult();
@ -217,7 +217,7 @@ public class ResultMapper implements Serializable {
 				.ofNullable(input.getInstance());
 			if (oInst.isPresent()) {
-				if (graph) {
+				if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
 					((GraphResult) out)
 						.setInstance(oInst.get().stream().map(i -> getGraphInstance(i)).collect(Collectors.toList()));
 				} else {
@ -296,7 +296,7 @@ public class ResultMapper implements Serializable {
 			out.setType(input.getResulttype().getClassid());
 		}
-		if (!graph) {
+		if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
 			((CommunityResult) out)
 				.setCollectedfrom(
 					input
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java
@ -9,6 +9,7 @@ import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
@ -54,7 +55,7 @@ public class CommunitySplit implements Serializable {
 	private static void printResult(String c, Dataset<CommunityResult> result, String outputPath) {
 		Dataset<CommunityResult> community_products = result
-			.filter(r -> containsCommunity(r, c));
+			.filter((FilterFunction<CommunityResult>) r -> containsCommunity(r, c));
 		try {
 			community_products.first();
--- a/Show More
+++ b/Show More