Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into convert_hive_to_spark_actions

# Conflicts: # dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
2024-06-25 20:17:47 +03:00 · 2024-06-25 20:17:47 +03:00 · fe2275a9b0
parent a644a6f4fe 66cd28f70a
commit fe2275a9b0
155 changed files with 5516 additions and 4935 deletions
--- a/.gitignore
+++ b/.gitignore
@ -27,3 +27,4 @@ spark-warehouse
 /**/.factorypath
 /**/.scalafmt.conf
 /.java-version
+/dhp-shade-package/dependency-reduced-pom.xml
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
 		mojo.outputFile = testFolder;

 		// execute
-		Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+		try {
+			mojo.execute();
+			Assertions.assertTrue(false); // not reached
+		} catch (Exception e) {
+			Assertions
+				.assertTrue(
+					MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
+						IllegalArgumentException.class.isAssignableFrom(e.getClass()));
+		}
 	}

 	@Test
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -63,15 +63,14 @@

 	<dependencies>
 		<dependency>
-			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>dhp-pace-core</artifactId>
-			<version>${project.version}</version>
+			<groupId>edu.cmu</groupId>
+			<artifactId>secondstring</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>com.ibm.icu</groupId>
+			<artifactId>icu4j</artifactId>
 		</dependency>

-		<dependency>
-			<groupId>org.apache.hadoop</groupId>
-			<artifactId>hadoop-common</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>com.github.sisyphsu</groupId>
 			<artifactId>dateparser</artifactId>
@ -161,7 +160,7 @@

 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>${dhp-schemas.artifact}</artifactId>
+			<artifactId>dhp-schemas</artifactId>
 		</dependency>

 		<dependency>
@ -170,4 +169,23 @@
 		</dependency>
 	</dependencies>

+	<!-- dependencies required on JDK9+ because J2EE has been removed -->
+	<profiles>
+		<profile>
+			<id>spark-34</id>
+			<dependencies>
+				<dependency>
+					<groupId>javax.xml.bind</groupId>
+					<artifactId>jaxb-api</artifactId>
+					<version>2.2.11</version>
+				</dependency>
+				<dependency>
+					<groupId>com.sun.xml.ws</groupId>
+					<artifactId>jaxws-ri</artifactId>
+					<version>2.3.3</version>
+					<type>pom</type>
+				</dependency>
+			</dependencies>
+		</profile>
+	</profiles>
 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
@ -38,7 +38,7 @@ public class PacePerson {
 					PacePerson.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/common/name_particles.txt")));
-		} catch (IOException e) {
+		} catch (Exception e) {
 			throw new RuntimeException(e);
 		}
 	}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
 	 *            part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
 	 *            concept_rec_id = 656930
 	 * @return response code
-	 * @throws IOException
-	 * @throws MissingConceptDoiException
 	 */
 	public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
 		setDepositionId(concept_rec_id, 1);
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.lang3.time.DateUtils;
 import org.apache.http.HttpHeaders;
-import org.joda.time.Instant;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
 import com.wcohen.ss.JaroWinkler;

 import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.model.Person;
 import scala.Tuple2;
@ -146,10 +147,20 @@ public class AuthorMerger {
 	}

 	public static String pidToComparableString(StructuredProperty pid) {
-		final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
-			: "";
-		return (pid.getQualifier() != null ? classid : "")
-			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
+		final String classId = Optional
+			.ofNullable(pid)
+			.map(
+				p -> Optional
+					.ofNullable(p.getQualifier())
+					.map(Qualifier::getClassid)
+					.map(String::toLowerCase)
+					.orElse(""))
+			.orElse("");
+		return Optional
+			.ofNullable(pid)
+			.map(StructuredProperty::getValue)
+			.map(v -> String.join("|", v, classId))
+			.orElse("");
 	}

 	public static int countAuthorsPids(List<Author> authors) {
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
@ -135,7 +135,7 @@ public class GroupEntitiesSparkJob {
 					.applyCoarVocabularies(entity, vocs),
 				OAFENTITY_KRYO_ENC)
 			.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
-			.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeGroup, OAFENTITY_KRYO_ENC)
+			.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
 			.map(
 				(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
 					t.getClass().getName(), t),
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java
@ -0,0 +1,106 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.util.*;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class MergeEntitiesComparator implements Comparator<Oaf> {
+	static final List<String> PID_AUTHORITIES = Arrays
+		.asList(
+			ModelConstants.ARXIV_ID,
+			ModelConstants.PUBMED_CENTRAL_ID,
+			ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
+			ModelConstants.DATACITE_ID,
+			ModelConstants.CROSSREF_ID);
+
+	static final List<String> RESULT_TYPES = Arrays
+		.asList(
+			ModelConstants.ORP_RESULTTYPE_CLASSID,
+			ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
+			ModelConstants.DATASET_RESULTTYPE_CLASSID,
+			ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
+
+	public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
+
+	@Override
+	public int compare(Oaf left, Oaf right) {
+		if (left == null && right == null)
+			return 0;
+		if (left == null)
+			return -1;
+		if (right == null)
+			return 1;
+
+		int res = 0;
+
+		// pid authority
+		int cfp1 = Optional
+			.ofNullable(left.getCollectedfrom())
+			.map(
+				cf -> cf
+					.stream()
+					.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
+					.max(Integer::compare)
+					.orElse(-1))
+			.orElse(-1);
+		int cfp2 = Optional
+			.ofNullable(right.getCollectedfrom())
+			.map(
+				cf -> cf
+					.stream()
+					.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
+					.max(Integer::compare)
+					.orElse(-1))
+			.orElse(-1);
+
+		if (cfp1 >= 0 && cfp1 > cfp2) {
+			return 1;
+		} else if (cfp2 >= 0 && cfp2 > cfp1) {
+			return -1;
+		}
+
+		// trust
+		if (left.getDataInfo() != null && right.getDataInfo() != null) {
+			res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
+		}
+
+		// result type
+		if (res == 0) {
+			if (left instanceof Result && right instanceof Result) {
+				Result r1 = (Result) left;
+				Result r2 = (Result) right;
+
+				if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
+					if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
+						return -1;
+					}
+				} else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
+					return 1;
+				}
+
+				int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
+				int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());
+
+				if (rt1 >= 0 && rt1 > rt2) {
+					return 1;
+				} else if (rt2 >= 0 && rt2 > rt1) {
+					return -1;
+				}
+			}
+		}
+
+		// id
+		if (res == 0) {
+			if (left instanceof OafEntity && right instanceof OafEntity) {
+				res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
+			}
+		}
+
+		return res;
+	}
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@ -30,40 +30,37 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;

 public class MergeUtils {
+	public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
+		return mergeGroup(s, oafEntityIterator, true);
+	}

 	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
-		TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
-			int res = 0;
-
-			if (o1.getDataInfo() != null && o2.getDataInfo() != null) {
-				res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust());
+		return mergeGroup(s, oafEntityIterator, false);
 	}

-			if (res == 0) {
-				if (o1 instanceof Result && o2 instanceof Result) {
-					return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2);
-				}
-			}
+	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
+		boolean checkDelegateAuthority) {

-			return res;
-		});
+		ArrayList<T> sortedEntities = new ArrayList<>();
+		oafEntityIterator.forEachRemaining(sortedEntities::add);
+		sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());

-		while (oafEntityIterator.hasNext()) {
-			sortedEntities.add(oafEntityIterator.next());
-		}
+		Iterator<T> it = sortedEntities.iterator();
+		T merged = it.next();

-		T merged = sortedEntities.descendingIterator().next();
-
-		Iterator<T> it = sortedEntities.descendingIterator();
 		while (it.hasNext()) {
-			merged = checkedMerge(merged, it.next());
+			merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
 		}

 		return merged;
 	}

-	public static <T extends Oaf> T checkedMerge(final T left, final T right) {
-		return (T) merge(left, right, false);
+	public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
+		return (T) merge(left, right, checkDelegateAuthority);
+	}
+
+	public static <T extends Result, E extends Result> Result mergeResult(final T left, final E right) {
+		return (Result) merge(left, right, false);
 	}

 	public static Oaf merge(final Oaf left, final Oaf right) {
@ -108,7 +105,7 @@ public class MergeUtils {
 				return mergeSoftware((Software) left, (Software) right);
 			}

-			return mergeResult((Result) left, (Result) right);
+			return mergeResultFields((Result) left, (Result) right);
 		} else if (sameClass(left, right, Datasource.class)) {
 			// TODO
 			final int trust = compareTrust(left, right);
@ -131,7 +128,7 @@ public class MergeUtils {
 	 * https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
 	 * such version.
 	 * <p>
-	 * Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
+	 * Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator}
 	 * and proceeds with the canonical property merging.
 	 *
 	 * @param left
@ -149,11 +146,12 @@ public class MergeUtils {
 		if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
 			return right;
 		}
+
 		// TODO: raise trust to have preferred fields from one or the other??
-		if (new ResultTypeComparator().compare(left, right) < 0) {
-			return mergeResult(left, right);
+		if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) {
+			return mergeResultFields(left, right);
 		} else {
-			return mergeResult(right, left);
+			return mergeResultFields(right, left);
 		}
 	}

@ -213,9 +211,9 @@ public class MergeUtils {

 	private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
 		Function<T, K> keyExtractor, BinaryOperator<T> merger) {
-		if (left == null) {
-			return right;
-		} else if (right == null) {
+		if (left == null || left.isEmpty()) {
+			return right != null ? right : new ArrayList<>();
+		} else if (right == null || right.isEmpty()) {
 			return left;
 		}

@ -263,6 +261,12 @@ public class MergeUtils {

 	// TODO review
 	private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
+		if (left == null) {
+			return right;
+		} else if (right == null) {
+			return left;
+		}
+
 		if (trust < 0) {
 			List<KeyValue> s = left;
 			left = right;
@ -270,8 +274,9 @@ public class MergeUtils {
 		}

 		HashMap<String, KeyValue> values = new HashMap<>();
-		left.forEach(kv -> values.put(kv.getKey(), kv));
-		right.forEach(kv -> values.putIfAbsent(kv.getKey(), kv));
+
+		Optional.ofNullable(left).ifPresent(l -> l.forEach(kv -> values.put(kv.getKey(), kv)));
+		Optional.ofNullable(right).ifPresent(r -> r.forEach(kv -> values.putIfAbsent(kv.getKey(), kv)));

 		return new ArrayList<>(values.values());
 	}
@ -367,7 +372,7 @@ public class MergeUtils {
 		return merge;
 	}

-	public static <T extends Result> T mergeResult(T original, T enrich) {
+	private static <T extends Result> T mergeResultFields(T original, T enrich) {
 		final int trust = compareTrust(original, enrich);
 		T merge = mergeOafEntityFields(original, enrich, trust);

@ -386,7 +391,7 @@ public class MergeUtils {
 		}

 		// should be an instance attribute, get the first non-null value
-		merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage()));
+		merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage()));

 		// distinct countries, do not manage datainfo
 		merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
@ -556,6 +561,13 @@ public class MergeUtils {
 		return m != null ? m : e;
 	}

+	private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
+		if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) {
+			return e;
+		}
+		return m;
+	}
+
 	private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
 		List<List<Author>> authors = new ArrayList<>();
 		if (author != null) {
@ -568,6 +580,10 @@ public class MergeUtils {
 	}

 	private static String instanceKeyExtractor(Instance i) {
+		// three levels of concatenating:
+		// 1. ::
+		// 2. @@
+		// 3. ||
 		return String
 			.join(
 				"::",
@ -575,10 +591,10 @@ public class MergeUtils {
 				kvKeyExtractor(i.getCollectedfrom()),
 				qualifierKeyExtractor(i.getAccessright()),
 				qualifierKeyExtractor(i.getInstancetype()),
-				Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
+				Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
 				Optional
 					.ofNullable(i.getPid())
-					.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
+					.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
 					.orElse(null));
 	}

@ -687,13 +703,13 @@ public class MergeUtils {
 	private static String spKeyExtractor(StructuredProperty sp) {
 		return Optional
 			.ofNullable(sp)
-			.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
+			.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
 			.orElse(null);
 	}

 	private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
 		int trust = compareTrust(original, enrich);
-		final T merge = mergeResult(original, enrich);
+		final T merge = mergeResultFields(original, enrich);

 		merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
 		merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
@ -704,7 +720,7 @@ public class MergeUtils {

 	private static <T extends Software> T mergeSoftware(T original, T enrich) {
 		int trust = compareTrust(original, enrich);
-		final T merge = mergeResult(original, enrich);
+		final T merge = mergeResultFields(original, enrich);

 		merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
 		merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
@ -718,7 +734,7 @@ public class MergeUtils {

 	private static <T extends Dataset> T mergeDataset(T original, T enrich) {
 		int trust = compareTrust(original, enrich);
-		T merge = mergeResult(original, enrich);
+		T merge = mergeResultFields(original, enrich);

 		merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
 		merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
@ -737,7 +753,7 @@ public class MergeUtils {

 	public static <T extends Publication> T mergePublication(T original, T enrich) {
 		final int trust = compareTrust(original, enrich);
-		T merged = mergeResult(original, enrich);
+		T merged = mergeResultFields(original, enrich);

 		merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));

@ -855,9 +871,11 @@ public class MergeUtils {
 		if (toEnrichInstances == null) {
 			return enrichmentResult;
 		}
-		if (enrichmentInstances == null) {
-			return enrichmentResult;
+
+		if (enrichmentInstances == null || enrichmentInstances.isEmpty()) {
+			return toEnrichInstances;
 		}
+
 		Map<String, Instance> ri = toInstanceMap(enrichmentInstances);

 		toEnrichInstances.forEach(i -> {
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java
@ -1,78 +0,0 @@
-
-package eu.dnetlib.dhp.schema.oaf.utils;
-
-import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
-
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.Optional;
-import java.util.stream.Collectors;
-
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class ResultTypeComparator implements Comparator<Result> {
-
-	public static final ResultTypeComparator INSTANCE = new ResultTypeComparator();
-
-	@Override
-	public int compare(Result left, Result right) {
-
-		if (left == null && right == null)
-			return 0;
-		if (left == null)
-			return 1;
-		if (right == null)
-			return -1;
-
-		HashSet<String> lCf = getCollectedFromIds(left);
-		HashSet<String> rCf = getCollectedFromIds(right);
-
-		if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
-			return -1;
-		}
-		if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
-			return 1;
-		}
-
-		String lClass = left.getResulttype().getClassid();
-		String rClass = right.getResulttype().getClassid();
-
-		if (!lClass.equals(rClass)) {
-			if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
-				return 1;
-
-			if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
-				return 1;
-
-			if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
-				return 1;
-
-			if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
-				return 1;
-		}
-
-		// Else (but unlikely), lexicographical ordering will do.
-		return lClass.compareTo(rClass);
-	}
-
-	protected HashSet<String> getCollectedFromIds(Result left) {
-		return Optional
-			.ofNullable(left.getCollectedfrom())
-			.map(
-				cf -> cf
-					.stream()
-					.map(KeyValue::getKey)
-					.collect(Collectors.toCollection(HashSet::new)))
-			.orElse(new HashSet<>());
-	}
-}
--- a/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java
@ -0,0 +1,101 @@
+
+package eu.dnetlib.pace.common;
+
+import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
+/**
+ * Set of common functions for the framework
+ *
+ * @author claudio
+ */
+public class PaceCommonUtils {
+
+	// transliterator
+	protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+	protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
+	protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
+
+	protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
+
+	protected static String fixAliases(final String s) {
+		final StringBuilder sb = new StringBuilder();
+
+		s.chars().forEach(ch -> {
+			final int i = StringUtils.indexOf(aliases_from, ch);
+			sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
+		});
+
+		return sb.toString();
+	}
+
+	protected static String transliterate(final String s) {
+		try {
+			return transliterator.transliterate(s);
+		} catch (Exception e) {
+			return s;
+		}
+	}
+
+	public static String normalize(final String s) {
+		return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+			.toLowerCase()
+			// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
+			// strings
+			.replaceAll("[^ \\w]+", "")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim();
+	}
+
+	public static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+
+	public static String unicodeNormalization(final String s) {
+
+		Matcher m = hexUnicodePattern.matcher(s);
+		StringBuffer buf = new StringBuffer(s.length());
+		while (m.find()) {
+			String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
+			m.appendReplacement(buf, Matcher.quoteReplacement(ch));
+		}
+		m.appendTail(buf);
+		return buf.toString();
+	}
+
+	public static Set<String> loadFromClasspath(final String classpath) {
+
+		Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+		final Set<String> h = Sets.newHashSet();
+		try {
+			for (final String s : IOUtils
+				.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
+				h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
+			}
+		} catch (final Throwable e) {
+			return Sets.newHashSet();
+		}
+		return h;
+	}
+
+	protected static Iterable<String> tokens(final String s, final int maxTokens) {
+		return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
+	}
+
+}
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@ -12,7 +12,7 @@ import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.hash.Hashing;

-import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.common.PaceCommonUtils;
 import eu.dnetlib.pace.util.Capitalise;
 import eu.dnetlib.pace.util.DotAbbreviations;

@ -86,7 +86,7 @@ public class Person {

 	private List<String> splitTerms(final String s) {
 		if (particles == null) {
-			particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
+			particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
 		}

 		final List<String> list = Lists.newArrayList();
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java
@ -15,4 +15,4 @@ public class Capitalise implements Function<String, String> {
 	public String apply(final String s) {
 		return WordUtils.capitalize(s.toLowerCase(), DELIM);
 	}
-};
+}
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java
@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
 	public String apply(String s) {
 		return s.length() == 1 ? s + "." : s;
 	}
-};
+}
--- a/dhp-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt
+++ b/dhp-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt
--- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
+++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
@ -154,5 +154,13 @@
  "unknown":{
    "original":"Unknown",
    "inverse":"Unknown"
+  },
+  "isamongtopnsimilardocuments": {
+    "original": "IsAmongTopNSimilarDocuments",
+    "inverse": "HasAmongTopNSimilarDocuments"
+  },
+  "hasamongtopnsimilardocuments": {
+    "original": "HasAmongTopNSimilarDocuments",
+    "inverse": "IsAmongTopNSimilarDocuments"
  }
 }
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
    val conf: SparkConf = new SparkConf()
    val master = parser.get("master")
    log.info(s"Creating Spark session: Master: $master")
-    SparkSession
+    val b = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
-      .master(master)
-      .getOrCreate()
+    if (master != null)
+      b.master(master)
+    b.getOrCreate()
  }

  def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
  }

  def generateScholixResourceFromResult(r: Result): ScholixResource = {
+    val sum = ScholixUtils.resultToSummary(r)
+    if (sum != null)
      generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+    else
+      null
  }

  val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {

  }

+  def invRel(rel: String): String = {
+    val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
+    if (semanticRelation != null)
+      semanticRelation.inverse
+    else
+      null
+  }
+
  def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
    if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
      val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
    if (persistentIdentifiers.isEmpty)
      return null
    s.setLocalIdentifier(persistentIdentifiers.asJava)
-    if (r.isInstanceOf[Publication])
-      s.setTypology(Typology.publication)
-    else
-      s.setTypology(Typology.dataset)
+//    s.setTypology(r.getResulttype.getClassid)

    s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

--- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtilsTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtilsTest.java
@ -63,7 +63,7 @@ public class MergeUtilsTest {
 		assertEquals(1, d1.getCollectedfrom().size());
 		assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));

-		final Result p1d2 = MergeUtils.checkedMerge(p1, d2);
+		final Result p1d2 = MergeUtils.checkedMerge(p1, d2, true);
 		assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
 		assertTrue(p1d2 instanceof Publication);
 		assertEquals(p1.getId(), p1d2.getId());
@ -74,7 +74,7 @@ public class MergeUtilsTest {
 		Publication p2 = read("publication_2.json", Publication.class);
 		Dataset d1 = read("dataset_1.json", Dataset.class);

-		final Result p2d1 = MergeUtils.checkedMerge(p2, d1);
+		final Result p2d1 = MergeUtils.checkedMerge(p2, d1, true);
 		assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
 		assertTrue(p2d1 instanceof Dataset);
 		assertEquals(d1.getId(), p2d1.getId());
@ -86,7 +86,7 @@ public class MergeUtilsTest {
 		Publication p1 = read("publication_1.json", Publication.class);
 		Publication p2 = read("publication_2.json", Publication.class);

-		Result p1p2 = MergeUtils.checkedMerge(p1, p2);
+		Result p1p2 = MergeUtils.checkedMerge(p1, p2, true);
 		assertTrue(p1p2 instanceof Publication);
 		assertEquals(p1.getId(), p1p2.getId());
 		assertEquals(2, p1p2.getCollectedfrom().size());
--- a/dhp-pace-core/pom.xml
+++ b/dhp-pace-core/pom.xml
@ -24,7 +24,7 @@
 				<executions>
 					<execution>
 						<id>scala-compile-first</id>
-						<phase>initialize</phase>
+						<phase>process-resources</phase>
 						<goals>
 							<goal>add-source</goal>
 							<goal>compile</goal>
@ -49,18 +49,16 @@
 	</build>

 	<dependencies>
+		<dependency>
+			<groupId>eu.dnetlib.dhp</groupId>
+			<artifactId>dhp-common</artifactId>
+			<version>${project.version}</version>
+		</dependency>
+
 		<dependency>
 			<groupId>edu.cmu</groupId>
 			<artifactId>secondstring</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>com.google.guava</groupId>
-			<artifactId>guava</artifactId>
-		</dependency>
-		<dependency>
-			<groupId>com.google.code.gson</groupId>
-			<artifactId>gson</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-lang3</artifactId>
@ -85,10 +83,6 @@
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-databind</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>org.apache.commons</groupId>
-			<artifactId>commons-math3</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>com.jayway.jsonpath</groupId>
 			<artifactId>json-path</artifactId>
@ -107,4 +101,90 @@
 		</dependency>
 	</dependencies>

+	<profiles>
+		<profile>
+			<id>spark-24</id>
+			<activation>
+				<activeByDefault>true</activeByDefault>
+			</activation>
+
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-2</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<id>spark-34</id>
+
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-2</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<id>spark-35</id>
+
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-35</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+	</profiles>
+
 </project>
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@ -4,7 +4,6 @@ package eu.dnetlib.pace.common;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
-import java.text.Normalizer;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@ -14,19 +13,15 @@ import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;

 import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 import com.ibm.icu.text.Transliterator;

-import eu.dnetlib.pace.clustering.NGramUtils;
-
 /**
 * Set of common functions for the framework
 *
 * @author claudio
 */
-public class AbstractPaceFunctions {
+public class AbstractPaceFunctions extends PaceCommonUtils {

 	// city map to be used when translating the city names into codes
 	private static Map<String, String> cityMap = AbstractPaceFunctions
@ -41,9 +36,6 @@ public class AbstractPaceFunctions {
 	protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
 	protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");

-	// transliterator
-	protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
-
 	// blacklist of ngrams: to avoid generic keys
 	protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");

@ -51,8 +43,6 @@ public class AbstractPaceFunctions {
 	public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");

 	private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
-	private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
-	private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";

 	// doi prefix for normalization
 	public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
@ -129,25 +119,6 @@ public class AbstractPaceFunctions {
 		return numberPattern.matcher(strNum).matches();
 	}

-	protected static String fixAliases(final String s) {
-		final StringBuilder sb = new StringBuilder();
-
-		s.chars().forEach(ch -> {
-			final int i = StringUtils.indexOf(aliases_from, ch);
-			sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
-		});
-
-		return sb.toString();
-	}
-
-	protected static String transliterate(final String s) {
-		try {
-			return transliterator.transliterate(s);
-		} catch (Exception e) {
-			return s;
-		}
-	}
-
 	protected static String removeSymbols(final String s) {
 		final StringBuilder sb = new StringBuilder();

@ -162,23 +133,6 @@ public class AbstractPaceFunctions {
 		return s != null;
 	}

-	public static String normalize(final String s) {
-		return fixAliases(transliterate(nfd(unicodeNormalization(s))))
-			.toLowerCase()
-			// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
-			// strings
-			.replaceAll("[^ \\w]+", "")
-			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
-			.replaceAll("(\\p{Punct})+", " ")
-			.replaceAll("(\\d)+", " ")
-			.replaceAll("(\\n)+", " ")
-			.trim();
-	}
-
-	public static String nfd(final String s) {
-		return Normalizer.normalize(s, Normalizer.Form.NFD);
-	}
-
 	public static String utf8(final String s) {
 		byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
 		return new String(bytes, StandardCharsets.UTF_8);
@ -233,22 +187,6 @@ public class AbstractPaceFunctions {
 		return newset;
 	}

-	public static Set<String> loadFromClasspath(final String classpath) {
-
-		Transliterator transliterator = Transliterator.getInstance("Any-Eng");
-
-		final Set<String> h = Sets.newHashSet();
-		try {
-			for (final String s : IOUtils
-				.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
-				h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
-			}
-		} catch (final Throwable e) {
-			return Sets.newHashSet();
-		}
-		return h;
-	}
-
 	public static Map<String, String> loadMapFromClasspath(final String classpath) {

 		Transliterator transliterator = Transliterator.getInstance("Any-Eng");
@ -303,10 +241,6 @@ public class AbstractPaceFunctions {
 		return StringUtils.substring(s, 0, 1).toLowerCase();
 	}

-	protected static Iterable<String> tokens(final String s, final int maxTokens) {
-		return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
-	}
-
 	public static String normalizePid(String pid) {
 		return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
 	}
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
 import com.jayway.jsonpath.{Configuration, JsonPath}
 import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
  val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

  val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(RowEncoder(schema))
+    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
  }

  def rowFromJson(json: String): Row = {
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@ -1,19 +1,20 @@

 package eu.dnetlib.pace.tree;

-import com.wcohen.ss.AbstractStringDistance;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Person;
-import eu.dnetlib.pace.tree.support.AbstractListComparator;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
-import eu.dnetlib.pace.util.AuthorMatchers;
-
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.function.BiFunction;
 import java.util.stream.Collectors;

+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.util.AuthorMatchers;
+
@ComparatorClass("authorsMatch")
 public class AuthorsMatch extends AbstractListComparator {

--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java
--- a/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
+++ b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+    RowEncoder(schema)
+  }
+}
--- a/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
+++ b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+    ExpressionEncoder(schema)
+  }
+}
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -7,6 +7,7 @@ import java.util.HashMap;
 import java.util.Map;

 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import eu.dnetlib.pace.model.Person;
@ -22,7 +23,7 @@ public class UtilTest {
 	}

 	@Test
-	@Ignore
+	@Disabled
 	public void paceResolverTest() {
 		PaceResolver paceResolver = new PaceResolver();
 		paceResolver.getComparator("keywordMatch", params);
--- a/dhp-shade-package/dependency-reduced-pom.xml
+++ b/dhp-shade-package/dependency-reduced-pom.xml
@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <parent>
+    <artifactId>dhp</artifactId>
+    <groupId>eu.dnetlib.dhp</groupId>
+    <version>1.2.5-SNAPSHOT</version>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+  <artifactId>dhp-shade-package</artifactId>
+  <description>This module create a jar of all module dependencies</description>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <transformers>
+                <transformer>
+                  <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
+                </transformer>
+                <transformer />
+                <transformer>
+                  <resource>META-INF/cxf/bus-extensions.txt</resource>
+                </transformer>
+              </transformers>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/maven/**</exclude>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <relocations>
+                <relocation>
+                  <pattern>com</pattern>
+                  <shadedPattern>repackaged.com.google.common</shadedPattern>
+                  <includes>
+                    <include>com.google.common.**</include>
+                  </includes>
+                </relocation>
+              </relocations>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.projectlombok</groupId>
+      <artifactId>lombok</artifactId>
+      <version>1.18.28</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter</artifactId>
+      <version>5.6.1</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>junit-jupiter-api</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>junit-jupiter-params</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>junit-jupiter-engine</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>3.3.3</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>byte-buddy</artifactId>
+          <groupId>net.bytebuddy</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>byte-buddy-agent</artifactId>
+          <groupId>net.bytebuddy</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-junit-jupiter</artifactId>
+      <version>3.3.3</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <distributionManagement>
+    <site>
+      <id>DHPSite</id>
+      <url>${dhp.site.stage.path}/dhp-common</url>
+    </site>
+  </distributionManagement>
+</project>
--- a/dhp-shade-package/pom.xml
+++ b/dhp-shade-package/pom.xml
@ -0,0 +1,169 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <artifactId>dhp</artifactId>
+        <version>1.2.5-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+
+    </parent>
+
+    <artifactId>dhp-shade-package</artifactId>
+    <packaging>jar</packaging>
+
+    <distributionManagement>
+        <site>
+            <id>DHPSite</id>
+            <url>${dhp.site.stage.path}/dhp-common</url>
+        </site>
+    </distributionManagement>
+
+    <description>This module create a jar of all module dependencies</description>
+
+
+    <dependencies>
+
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-actionmanager</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-aggregation</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-blacklist</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-broker-events</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-dedup-openaire</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-enrichment</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-graph-mapper</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-graph-provision</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-impact-indicators</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-actionsets</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-hist-snaps</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-monitor-irish</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-promote</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-update</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-swh</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-usage-raw-data-update</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-usage-stats-build</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+    </dependencies>
+
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <transformers>
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
+                                </transformer>
+                                <!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                                    <resource>META-INF/cxf/bus-extensions.txt</resource>
+                                </transformer>
+                            </transformers>
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/maven/**</exclude>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <relocations>
+                                <relocation>
+                                    <pattern>com</pattern>
+                                    <shadedPattern>repackaged.com.google.common</shadedPattern>
+                                    <includes>
+                                        <include>com.google.common.**</include>
+                                    </includes>
+                                </relocation>
+                            </relocations>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -156,6 +157,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
@ -125,6 +125,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -155,11 +156,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
@ -103,11 +103,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -156,11 +157,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
@ -95,11 +95,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=10000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -155,11 +156,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=4000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@ -9,6 +9,7 @@ import java.util.List;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
 					.union(openAPCRelations)
 					.union(dataciteRelations)
 					.saveAsHadoopFile(
-						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);

 			});
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -10,6 +10,7 @@ import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
 				resultsRDD
 					.union(projectsRDD)
 					.saveAsHadoopFile(
-						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 			});
 	}

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
@ -80,9 +80,11 @@ public class PrepareFOSSparkJob implements Serializable {

 		fosDataset
 			.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING())
-			.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
-				return getResult(ModelSupport.getIdPrefix(Result.class) + "|" + k, it);
-			}, Encoders.bean(Result.class))
+			.mapGroups(
+				(MapGroupsFunction<String, FOSDataModel, Result>) (k,
+					it) -> getResult(
+						ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k, it),
+				Encoders.bean(Result.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
@ -113,19 +115,7 @@ public class PrepareFOSSparkJob implements Serializable {
 			.forEach(
 				l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
 		r.setSubject(sbjs);
-		r
-			.setDataInfo(
-				OafMapperUtils
-					.dataInfo(
-						false, null, true,
-						false,
-						OafMapperUtils
-							.qualifier(
-								ModelConstants.PROVENANCE_ENRICH,
-								null,
-								ModelConstants.DNET_PROVENANCE_ACTIONS,
-								ModelConstants.DNET_PROVENANCE_ACTIONS),
-						null));
+
 		return r;
 	}

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java
@ -81,19 +81,7 @@ public class PrepareSDGSparkJob implements Serializable {
 						s -> sbjs
 							.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
 				r.setSubject(sbjs);
-				r
-					.setDataInfo(
-						OafMapperUtils
-							.dataInfo(
-								false, null, true,
-								false,
-								OafMapperUtils
-									.qualifier(
-										ModelConstants.PROVENANCE_ENRICH,
-										null,
-										ModelConstants.DNET_PROVENANCE_ACTIONS,
-										ModelConstants.DNET_PROVENANCE_ACTIONS),
-								null));
+
 				return r;
 			}, Encoders.bean(Result.class))
 			.write()
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/CreateActionSetSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/CreateActionSetSparkJob.java
@ -93,7 +93,7 @@ public class CreateActionSetSparkJob implements Serializable {
 			.filter((FilterFunction<Relation>) Objects::nonNull)
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p));
-//TODO relations in stand-by waiting to know if we need to create them or not In case we need just make a union before saving the sequence file
+
 		spark
 			.read()
 			.textFile(inputPath)
@ -108,6 +108,7 @@ public class CreateActionSetSparkJob implements Serializable {
 			.filter((FilterFunction<Result>) r -> r != null)
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p))
+			.union(relations)
 			.mapToPair(
 				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
@ -0,0 +1,251 @@
+
+package eu.dnetlib.dhp.actionmanager.webcrawl;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.types.StructType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
+import eu.dnetlib.dhp.schema.oaf.utils.PidType;
+import scala.Tuple2;
+
+/**
+ * @author miriam.baglioni
+ * @Date 18/04/24
+ */
+public class CreateActionSetFromWebEntries implements Serializable {
+	private static final Logger log = LoggerFactory.getLogger(CreateActionSetFromWebEntries.class);
+	private static final String DOI_PREFIX = "50|doi_________::";
+
+	private static final String ROR_PREFIX = "20|ror_________::";
+
+	private static final String PMID_PREFIX = "50|pmid________::";
+
+	private static final String PMCID_PREFIX = "50|pmc_________::";
+	private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
+	private static final String WEB_CRAWL_NAME = "Web Crawl";
+	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	public static void main(String[] args) throws Exception {
+		String jsonConfiguration = IOUtils
+			.toString(
+				CreateActionSetFromWebEntries.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json"));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+		parser.parseArgument(args);
+
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String inputPath = parser.get("sourcePath");
+		log.info("inputPath: {}", inputPath);
+
+		final String outputPath = parser.get("outputPath");
+		log.info("outputPath: {}", outputPath);
+
+		final String blackListInputPath = parser.get("blackListPath");
+		log.info("blackListInputPath: {}", blackListInputPath);
+
+		SparkConf conf = new SparkConf();
+
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+
+				createActionSet(spark, inputPath, outputPath, blackListInputPath);
+
+			});
+	}
+
+	public static void createActionSet(SparkSession spark, String inputPath,
+		String outputPath, String blackListInputPath) {
+
+		final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
+			.filter("country_code=='IE'")
+			.drop("publication_year");
+
+		final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
+
+		dataset
+			.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
+			.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
+			.drop("OpenAlexId")
+			.flatMap((FlatMapFunction<Row, Relation>) row -> {
+				List<Relation> ret = new ArrayList<>();
+				final String ror = ROR_PREFIX
+					+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
+				ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
+				ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
+				ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
+
+				return ret
+					.iterator();
+			}, Encoders.bean(Relation.class))
+			.toJavaRDD()
+			.map(p -> new AtomicAction(p.getClass(), p))
+			.mapToPair(
+				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
+					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
+			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+
+	}
+
+	private static Dataset<Row> readWebCrawl(SparkSession spark, String inputPath) {
+		StructType webInfo = StructType
+			.fromDDL(
+				"`id` STRING , `doi` STRING, `ids` STRUCT<`pmid` :STRING, `pmcid`: STRING >, `publication_year` STRING, "
+					+
+					"`authorships` ARRAY<STRUCT <`institutions`: ARRAY <STRUCT <`ror`: STRING, `country_code` :STRING>>>>");
+
+		return spark
+			.read()
+			.schema(webInfo)
+			.json(inputPath)
+			.withColumn(
+				"authors", functions
+					.explode(
+						functions.col("authorships")))
+			.selectExpr("id", "doi", "ids", "publication_year", "authors.institutions as institutions")
+			.withColumn(
+				"institution", functions
+					.explode(
+						functions.col("institutions")))
+			.selectExpr(
+				"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
+				"institution.country_code as country_code", "publication_year")
+			.distinct();
+
+	}
+
+	private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
+
+		return spark
+			.read()
+			.option("header", true)
+			.csv(inputPath)
+			.select("OpenAlexId");
+	}
+
+	private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
+		if (pmcid == null)
+			return new ArrayList<>();
+
+		return createAffiliatioRelationPair(
+			PMCID_PREFIX
+				+ IdentifierFactory
+					.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC", pmcid))),
+			ror);
+	}
+
+	private static List<Relation> createAffiliationRelationPairPMID(String pmid, String ror) {
+		if (pmid == null)
+			return new ArrayList<>();
+
+		return createAffiliatioRelationPair(
+			PMID_PREFIX
+				+ IdentifierFactory
+					.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
+			ror);
+	}
+
+	private static String removeResolver(String pidType, String pid) {
+		switch (pidType) {
+			case "PMID":
+				return pid.substring(33);
+			case "PMC":
+				return "PMC" + pid.substring(43);
+			case "DOI":
+				return pid.substring(16);
+		}
+
+		throw new RuntimeException();
+
+	}
+
+	private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
+		if (doi == null)
+			return new ArrayList<>();
+
+		return createAffiliatioRelationPair(
+			DOI_PREFIX
+				+ IdentifierFactory
+					.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI", doi))),
+			ror);
+
+	}
+
+	private static List<Relation> createAffiliatioRelationPair(String resultId, String orgId) {
+		ArrayList<Relation> newRelations = new ArrayList();
+
+		newRelations
+			.add(
+				OafMapperUtils
+					.getRelation(
+						orgId, resultId, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION,
+						ModelConstants.IS_AUTHOR_INSTITUTION_OF,
+						Arrays
+							.asList(
+								OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
+						OafMapperUtils
+							.dataInfo(
+								false, null, false, false,
+								OafMapperUtils
+									.qualifier(
+										"sysimport:crasswalk:webcrawl", "Imported from Webcrawl",
+										ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
+								"0.9"),
+						null));
+
+		newRelations
+			.add(
+				OafMapperUtils
+					.getRelation(
+						resultId, orgId, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION,
+						ModelConstants.HAS_AUTHOR_INSTITUTION,
+						Arrays
+							.asList(
+								OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
+						OafMapperUtils
+							.dataInfo(
+								false, null, false, false,
+								OafMapperUtils
+									.qualifier(
+										"sysimport:crasswalk:webcrawl", "Imported from Webcrawl",
+										ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
+								"0.9"),
+						null));
+
+		return newRelations;
+
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
@ -1,6 +1,7 @@

 package eu.dnetlib.dhp.collection.plugin.rest;

+import java.util.Map;
 import java.util.Optional;
 import java.util.Spliterator;
 import java.util.Spliterators;
@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;

 import org.apache.commons.lang3.StringUtils;

+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
 		final String entityXpath = api.getParams().get("entityXpath");
 		final String authMethod = api.getParams().get("authMethod");
 		final String authToken = api.getParams().get("authToken");
+		final String requestHeaderMap = api.getParams().get("requestHeaderMap");
+		Gson gson = new Gson();
+		Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
 		final String resultSizeValue = Optional
 			.ofNullable(api.getParams().get("resultSizeValue"))
 			.filter(StringUtils::isNotBlank)
@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
 		if (StringUtils.isBlank(resultFormatValue)) {
 			throw new CollectorException("Param 'resultFormatValue' is null or empty");
 		}
-		if (StringUtils.isBlank(queryParams)) {
-			throw new CollectorException("Param 'queryParams' is null or empty");
-		}
 		if (StringUtils.isBlank(entityXpath)) {
 			throw new CollectorException("Param 'entityXpath' is null or empty");
 		}
@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
 			entityXpath,
 			authMethod,
 			authToken,
-			resultOutputFormat);
+			resultOutputFormat,
+			requestHeaders);

 		return StreamSupport
 			.stream(
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
@ -9,8 +9,11 @@ import java.net.URL;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
@ -22,14 +25,14 @@ import javax.xml.xpath.*;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.http.HttpHeaders;
-import org.apache.http.entity.ContentType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;

+import com.google.common.collect.Maps;
+
 import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;
@ -44,23 +47,28 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
 *
 */
 public class RestIterator implements Iterator<String> {
-
 	private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
 	public static final String UTF_8 = "UTF-8";
+	private static final int MAX_ATTEMPTS = 5;

 	private final HttpClientParams clientParams;

-	private final String BASIC = "basic";
+	private final String AUTHBASIC = "basic";
+
+	private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+	private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
+		+ ">";

 	private final String baseUrl;
 	private final String resumptionType;
 	private final String resumptionParam;
 	private final String resultFormatValue;
-	private String queryParams;
+	private String queryParams = "";
 	private final int resultSizeValue;
 	private int resumptionInt = 0; // integer resumption token (first record to harvest)
 	private int resultTotal = -1;
-	private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
+	private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
+																			// harvest
 	// or token scanned from results)
 	private InputStream resultStream;
 	private Transformer transformer;
@ -73,7 +81,7 @@ public class RestIterator implements Iterator<String> {
 	private final String querySize;
 	private final String authMethod;
 	private final String authToken;
-	private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
+	private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
 	private int discoverResultSize = 0;
 	private int pagination = 1;
 	/*
@ -83,8 +91,13 @@ public class RestIterator implements Iterator<String> {
 	 */
 	private final String resultOutputFormat;

-	/** RestIterator class
-	 *  compatible to version 1.3.33
+	/*
+	 * Can be used to set additional request headers, like for content negotiation
+	 */
+	private Map<String, String> requestHeaders;
+
+	/**
+	 * RestIterator class compatible to version 1.3.33
 	 */
 	public RestIterator(
 		final HttpClientParams clientParams,
@ -101,47 +114,56 @@ public class RestIterator implements Iterator<String> {
 		final String entityXpath,
 		final String authMethod,
 		final String authToken,
-		final String resultOutputFormat) {
+		final String resultOutputFormat,
+		final Map<String, String> requestHeaders) {

 		this.clientParams = clientParams;
 		this.baseUrl = baseUrl;
 		this.resumptionType = resumptionType;
 		this.resumptionParam = resumptionParam;
 		this.resultFormatValue = resultFormatValue;
-		this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
+		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
 		this.queryParams = queryParams;
 		this.authMethod = authMethod;
 		this.authToken = authToken;
 		this.resultOutputFormat = resultOutputFormat;
+		this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();

-		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+		this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+			: "";
+		this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
 			: "";
-		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

 		try {
 			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
-		} catch (Exception e) {
+		} catch (final Exception e) {
 			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
 		}

 		initQueue();
 	}

-	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
+	private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
+		final String entityXpath)
 		throws TransformerConfigurationException, XPathExpressionException {
 		final TransformerFactory factory = TransformerFactory.newInstance();
-		transformer = factory.newTransformer();
-		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
-		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
-		xpath = XPathFactory.newInstance().newXPath();
-		xprResultTotalPath = xpath.compile(resultTotalXpath);
-		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
-		xprEntity = xpath.compile(entityXpath);
+		this.transformer = factory.newTransformer();
+		this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+		this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
+		this.xpath = XPathFactory.newInstance().newXPath();
+		this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
+		this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
+		this.xprEntity = this.xpath.compile(entityXpath);
 	}

 	private void initQueue() {
+		if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
+			query = baseUrl;
+		} else {
 			query = baseUrl + "?" + queryParams + querySize + queryFormat;
-		log.info("REST calls starting with {}", query);
+		}
+
+		log.info("REST calls starting with {}", this.query);
 	}

 	private void disconnect() {
@ -154,11 +176,22 @@ public class RestIterator implements Iterator<String> {
 	 */
 	@Override
 	public boolean hasNext() {
-		if (recordQueue.isEmpty() && query.isEmpty()) {
+		synchronized (this.recordQueue) {
+			while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
+				try {
+					this.query = downloadPage(this.query, 0);
+				} catch (final CollectorException e) {
+					log.debug("CollectorPlugin.next()-Exception: {}", e);
+					throw new RuntimeException(e);
+				}
+			}
+
+			if (!this.recordQueue.isEmpty()) {
+				return true;
+			}
+
 			disconnect();
 			return false;
-		} else {
-			return true;
 		}
 	}

@ -168,27 +201,34 @@ public class RestIterator implements Iterator<String> {
 	 */
 	@Override
 	public String next() {
-		synchronized (recordQueue) {
-			while (recordQueue.isEmpty() && !query.isEmpty()) {
-				try {
-					query = downloadPage(query);
-				} catch (CollectorException e) {
-					log.debug("CollectorPlugin.next()-Exception: {}", e);
-					throw new RuntimeException(e);
-				}
-			}
-			return recordQueue.poll();
+		synchronized (this.recordQueue) {
+			return this.recordQueue.poll();
 		}
 	}

 	/*
-	 * download page and return nextQuery
+	 * download page and return nextQuery (with number of attempt)
 	 */
-	private String downloadPage(String query) throws CollectorException {
+	private String downloadPage(String query, final int attempt) throws CollectorException {
+
+		if (attempt > MAX_ATTEMPTS) {
+			throw new CollectorException("Max Number of attempts reached, query:" + query);
+		}
+
+		if (attempt > 0) {
+			final int delay = (attempt * 5000);
+			log.debug("Attempt {} with delay {}", attempt, delay);
+			try {
+				Thread.sleep(delay);
+			} catch (final InterruptedException e) {
+				new CollectorException(e);
+			}
+		}
+
+		try {
 			String resultJson;
-		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+			String resultXml = XML_HEADER;
 			String nextQuery = "";
-		String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
 			Node resultNode = null;
 			NodeList nodeList = null;
 			String qUrlArgument = "";
@ -196,81 +236,96 @@ public class RestIterator implements Iterator<String> {
 			InputStream theHttpInputStream;

 			// check if cursor=* is initial set otherwise add it to the queryParam URL
-		if (resumptionType.equalsIgnoreCase("deep-cursor")) {
+			if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
 				log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
 				if (!query.contains("&cursor=")) {
 					query += "&cursor=*";
 				}
 			}

+			// find pagination page start number in queryParam and remove before start the first query
+			if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
+				&& (query.contains("paginationStart="))) {
+
+				final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
+				m.find(); // guaranteed to be true for this regex
+
+				String[] pageVal = m.group(0).split("=");
+				pagination = Integer.parseInt(pageVal[1]);
+
+				// remove page start number from query and queryParams
+				queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", "");
+				query = query.replaceFirst("&?paginationStart=[0-9]+", "");
+
+			}
+
 			try {
-			log.info("requestig URL [{}]", query);
+				log.info("requesting URL [{}]", query);

-			URL qUrl = new URL(query);
-			log.debug("authMethod: {}", authMethod);
-			if ("bearer".equalsIgnoreCase(this.authMethod)) {
-				log.trace("authMethod before inputStream: {}", resultXml);
-				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
-				conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
-				conn.setRequestMethod("GET");
-				theHttpInputStream = conn.getInputStream();
-			} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
-				log.trace("authMethod before inputStream: {}", resultXml);
-				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
-				conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
-				conn.setRequestMethod("GET");
-				theHttpInputStream = conn.getInputStream();
-			} else {
-				theHttpInputStream = qUrl.openStream();
+				final URL qUrl = new URL(query);
+				log.debug("authMethod: {}", this.authMethod);
+				if (this.authMethod == "bearer") {
+					log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+					requestHeaders.put("Authorization", "Bearer " + authToken);
+					// requestHeaders.put("Content-Type", "application/json");
+				} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+					log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+					requestHeaders.put("Authorization", "Basic " + authToken);
+					// requestHeaders.put("accept", "application/xml");
 				}
+				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+				conn.setRequestMethod("GET");
+				this.setRequestHeader(conn);
+				resultStream = conn.getInputStream();

-			resultStream = theHttpInputStream;
-			if ("json".equals(resultOutputFormat)) {
-				resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
+				if ("json".equals(this.resultOutputFormat)) {
+					resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
 					resultXml = JsonUtils.convertToXML(resultJson);
-				resultStream = IOUtils.toInputStream(resultXml, UTF_8);
+					this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
 				}

-			if (!(emptyXml).equalsIgnoreCase(resultXml)) {
-				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
-				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
+				if (!isEmptyXml(resultXml)) {
+					resultNode = (Node) this.xpath
+						.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
+					nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
 					log.debug("nodeList.length: {}", nodeList.getLength());
 					for (int i = 0; i < nodeList.getLength(); i++) {
-					StringWriter sw = new StringWriter();
-					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
-					String toEnqueue = sw.toString();
-					if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
-						log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
+						final StringWriter sw = new StringWriter();
+						this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
+						final String toEnqueue = sw.toString();
+						if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
+							log
+								.warn(
+									"The following record resulted in empty item for the feeding queue: {}", resultXml);
 						} else {
-						recordQueue.add(sw.toString());
+							this.recordQueue.add(sw.toString());
 						}
 					}
 				} else {
 					log.warn("resultXml is equal with emptyXml");
 				}

-			resumptionInt += resultSizeValue;
+				this.resumptionInt += this.resultSizeValue;

-			switch (resumptionType.toLowerCase()) {
+				switch (this.resumptionType.toLowerCase()) {
 					case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
-					resumptionStr = xprResumptionPath.evaluate(resultNode);
+						this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
 						break;

 					case "count": // begin at one step for all records, iterate over items
-					resumptionStr = Integer.toString(resumptionInt);
+						this.resumptionStr = Integer.toString(this.resumptionInt);
 						break;

 					case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
-					if (resultSizeValue < 2) {
+						if (this.resultSizeValue < 2) {
 							throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
 						}
 						qUrlArgument = qUrl.getQuery();
-					String[] arrayQUrlArgument = qUrlArgument.split("&");
-					for (String arrayUrlArgStr : arrayQUrlArgument) {
-						if (arrayUrlArgStr.startsWith(resumptionParam)) {
-							String[] resumptionKeyValue = arrayUrlArgStr.split("=");
+
+						final String[] arrayQUrlArgument = qUrlArgument.split("&");
+						for (final String arrayUrlArgStr : arrayQUrlArgument) {
+							if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
+								final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
 								if (isInteger(resumptionKeyValue[1])) {
 									urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
 									log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
@ -280,61 +335,63 @@ public class RestIterator implements Iterator<String> {
 							}
 						}

-					if (((emptyXml).equalsIgnoreCase(resultXml))
-						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
+						if (isEmptyXml(resultXml)
+							|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
 							// resumptionStr = "";
 							if (nodeList != null) {
-							discoverResultSize += nodeList.getLength();
+								this.discoverResultSize += nodeList.getLength();
 							}
-						resultTotal = discoverResultSize;
+							this.resultTotal = this.discoverResultSize;
 						} else {
-						resumptionStr = Integer.toString(resumptionInt);
-						resultTotal = resumptionInt + 1;
+							this.resumptionStr = Integer.toString(this.resumptionInt);
+							this.resultTotal = this.resumptionInt + 1;
 							if (nodeList != null) {
-							discoverResultSize += nodeList.getLength();
+								this.discoverResultSize += nodeList.getLength();
 							}
 						}
-					log.info("discoverResultSize: {}", discoverResultSize);
+						log.info("discoverResultSize: {}", this.discoverResultSize);
 						break;

 					case "pagination":
 					case "page": // pagination, iterate over page numbers
-					pagination += 1;
-					if (nodeList != null) {
-						discoverResultSize += nodeList.getLength();
+						if (nodeList != null && nodeList.getLength() > 0) {
+							this.discoverResultSize += nodeList.getLength();
 						} else {
-						resultTotal = discoverResultSize;
-						pagination = discoverResultSize;
+							this.resultTotal = this.discoverResultSize;
+							this.pagination = this.discoverResultSize;
 						}
-					resumptionInt = pagination;
-					resumptionStr = Integer.toString(resumptionInt);
+						this.pagination += 1;
+						this.resumptionInt = this.pagination;
+						this.resumptionStr = Integer.toString(this.resumptionInt);
 						break;

-				case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
+					case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
+										// in
 										// solr)
 						// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
 						// deep-cursor, Param 'resultSizeValue' is less than 2");}

-					resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
-					queryParams = queryParams.replace("&cursor=*", "");
+						this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
+						this.queryParams = this.queryParams.replace("&cursor=*", "");

 						// terminating if length of nodeList is 0
-					if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
-						resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
+						if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
+							this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
 						} else {
-						resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
+							this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
+																									// resultSizeValue
 							// because the iteration is over
 							// real length and the
 							// resultSizeValue is added before
 							// the switch()
 						}

-					discoverResultSize = nodeList.getLength();
+						this.discoverResultSize = nodeList.getLength();

 						log
 							.debug(
-							"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
-								+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
+								"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
+									+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);

 						break;

@ -343,28 +400,30 @@ public class RestIterator implements Iterator<String> {
 						break;
 				}

-		} catch (Exception e) {
+			} catch (final Exception e) {
 				log.error(e.getMessage(), e);
 				throw new IllegalStateException("collection failed: " + e.getMessage());
 			}

 			try {
-			if (resultTotal == -1) {
-				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
-				if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
-					resultTotal += 1;
+				if (this.resultTotal == -1) {
+					this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
+					if ("page".equalsIgnoreCase(this.resumptionType)
+						&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+						this.resultTotal += 1;
 					} // to correct the upper bound
-				log.info("resultTotal was -1 is now: " + resultTotal);
+					log.info("resultTotal was -1 is now: " + this.resultTotal);
 				}
-		} catch (Exception e) {
+			} catch (final Exception e) {
 				log.error(e.getMessage(), e);
 				throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
 			}
-		log.debug("resultTotal: " + resultTotal);
-		log.debug("resInt: " + resumptionInt);
-		if (resumptionInt <= resultTotal) {
-			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
-				+ queryFormat;
+			log.debug("resultTotal: " + this.resultTotal);
+			log.debug("resInt: " + this.resumptionInt);
+			if (this.resumptionInt <= this.resultTotal) {
+				nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
+					+ this.resumptionStr
+					+ this.queryFormat;
 			} else {
 				nextQuery = "";
 				// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
@ -372,10 +431,18 @@ public class RestIterator implements Iterator<String> {
 			}
 			log.debug("nextQueryUrl: " + nextQuery);
 			return nextQuery;
+		} catch (final Throwable e) {
+			log.warn(e.getMessage(), e);
+			return downloadPage(query, attempt + 1);
+		}

 	}

-	private boolean isInteger(String s) {
+	private boolean isEmptyXml(String s) {
+		return EMPTY_XML.equalsIgnoreCase(s);
+	}
+
+	private boolean isInteger(final String s) {
 		boolean isValidInteger = false;
 		try {
 			Integer.parseInt(s);
@ -383,7 +450,7 @@ public class RestIterator implements Iterator<String> {
 			// s is a valid integer

 			isValidInteger = true;
-		} catch (NumberFormatException ex) {
+		} catch (final NumberFormatException ex) {
 			// s is not an integer
 		}

@ -391,20 +458,36 @@ public class RestIterator implements Iterator<String> {
 	}

 	// Method to encode a string value using `UTF-8` encoding scheme
-	private String encodeValue(String value) {
+	private String encodeValue(final String value) {
 		try {
 			return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
-		} catch (UnsupportedEncodingException ex) {
+		} catch (final UnsupportedEncodingException ex) {
 			throw new RuntimeException(ex.getCause());
 		}
 	}

+	/**
+	 * setRequestHeader
+	 *
+	 * setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
+	 * @param conn
+	 */
+	private void setRequestHeader(HttpURLConnection conn) {
+		if (requestHeaders != null) {
+			for (String key : requestHeaders.keySet()) {
+				conn.setRequestProperty(key, requestHeaders.get(key));
+			}
+			log.debug("Set Request Header with: " + requestHeaders);
+		}
+
+	}
+
 	public String getResultFormatValue() {
-		return resultFormatValue;
+		return this.resultFormatValue;
 	}

 	public String getResultOutputFormat() {
-		return resultOutputFormat;
+		return this.resultOutputFormat;
 	}

 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
@ -8,7 +8,10 @@ import java.io.StringWriter;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
 import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;

 import javax.xml.stream.XMLEventFactory;
 import javax.xml.stream.XMLEventReader;
@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.events.StartElement;
 import javax.xml.stream.events.XMLEvent;

+import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {

 	private String element;

+	private List<String> elements;
+
 	private InputStream inputStream;

 	public XMLIterator(final String element, final InputStream inputStream) {
 		super();
 		this.element = element;
+		if (element.contains(",")) {
+			elements = Arrays
+				.stream(element.split(","))
+				.filter(StringUtils::isNoneBlank)
+				.map(String::toLowerCase)
+				.collect(Collectors.toList());
+		}
 		this.inputStream = inputStream;
 		this.parser = getParser();
+
 		try {
 			this.current = findElement(parser);
 		} catch (XMLStreamException e) {
@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
 				final XMLEvent event = parser.nextEvent();

 				// TODO: replace with depth tracking instead of close tag tracking.
-				if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
+				if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
 					writer.add(event);
 					break;
 				}
@ -142,31 +156,48 @@ public class XMLIterator implements Iterator<String> {
 		XMLEvent peek = parser.peek();
 		if (peek != null && peek.isStartElement()) {
 			String name = peek.asStartElement().getName().getLocalPart();
-			if (element.equals(name)) {
+			if (isCheckTag(name))
 				return peek;
 		}
-		}

 		while (parser.hasNext()) {
-			final XMLEvent event = parser.nextEvent();
+			XMLEvent event = parser.nextEvent();
 			if (event != null && event.isStartElement()) {
 				String name = event.asStartElement().getName().getLocalPart();
-				if (element.equals(name)) {
+				if (isCheckTag(name))
 					return event;
 			}
 		}
-		}
 		return null;
 	}

 	private XMLEventReader getParser() {
 		try {
-			return inputFactory.get().createXMLEventReader(sanitize(inputStream));
+			XMLInputFactory xif = inputFactory.get();
+			xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+			return xif.createXMLEventReader(sanitize(inputStream));
 		} catch (XMLStreamException e) {
 			throw new RuntimeException(e);
 		}
 	}

+	private boolean isCheckTag(final String tagName) {
+		if (elements != null) {
+			final String found = elements
+				.stream()
+				.filter(e -> e.equalsIgnoreCase(tagName))
+				.findFirst()
+				.orElse(null);
+			if (found != null)
+				return true;
+		} else {
+			if (element.equalsIgnoreCase(tagName)) {
+				return true;
+			}
+		}
+		return false;
+	}
+
 	private Reader sanitize(final InputStream in) {
 		final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
 		charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
@ -0,0 +1,25 @@
+[
+  {
+    "paramName": "sp",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the zipped opencitations file",
+    "paramRequired": true
+  },
+  {
+    "paramName": "op",
+    "paramLongName": "outputPath",
+    "paramDescription": "the working path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "the hdfs name node",
+    "paramRequired": false
+  },{
+  "paramName": "bl",
+  "paramLongName": "blackListPath",
+  "paramDescription": "the working path",
+  "paramRequired": true
+}
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
@ -0,0 +1,3 @@
+sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
+outputPath=/tmp/miriam/webcrawlComplete/
+blackListPath=/user/miriam.baglioni/openalex-blackList
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/config-default.xml
@ -0,0 +1,58 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>sparkExecutorNumber</name>
+        <value>4</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>sparkDriverMemory</name>
+        <value>15G</value>
+    </property>
+    <property>
+        <name>sparkExecutorMemory</name>
+        <value>6G</value>
+    </property>
+    <property>
+        <name>sparkExecutorCores</name>
+        <value>1</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
@ -0,0 +1,54 @@
+<workflow-app name="WebCrawl Integration" xmlns="uri:oozie:workflow:0.5">
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+
+        </configuration>
+    </global>
+
+    <start to="create_actionset"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="create_actionset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Produces the AS for WC</name>
+            <class>eu.dnetlib.dhp.actionmanager.webcrawl.CreateActionSetFromWebEntries</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--blackListPath</arg><arg>${blackListPath}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
@ -58,7 +58,7 @@
    "uri": "http://dx.doi.org/10.13039/100010414",
    "name": "Health Research Board",
    "synonym": [
-      "501100001590"
+      "501100001590", "501100023273"
    ]
  },
  {
@ -85,24 +85,6 @@
    "name": "Irish College of General Practitioners",
    "synonym": []
  },
-  {
-    "id": "100012734",
-    "uri": "http://dx.doi.org/10.13039/100012734",
-    "name": "Department for Culture, Heritage and the Gaeltacht, Ireland",
-    "synonym": []
-  },
-  {
-    "id": "100012754",
-    "uri": "http://dx.doi.org/10.13039/100012754",
-    "name": "Horizon Pharma",
-    "synonym": []
-  },
-  {
-    "id": "100012891",
-    "uri": "http://dx.doi.org/10.13039/100012891",
-    "name": "Medical Research Charities Group",
-    "synonym": []
-  },
  {
    "id": "100012919",
    "uri": "http://dx.doi.org/10.13039/100012919",
@ -233,7 +215,7 @@
    "id": "100018064",
    "uri": "http://dx.doi.org/10.13039/100018064",
    "name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media",
-    "synonym": []
+    "synonym": ["100012734"]
  },
  {
    "id": "100018172",
@ -271,12 +253,6 @@
    "name": "An Roinn Sl\u00e1inte",
    "synonym": []
  },
-  {
-    "id": "100018998",
-    "uri": "http://dx.doi.org/10.13039/100018998",
-    "name": "Irish Research eLibrary",
-    "synonym": []
-  },
  {
    "id": "100019428",
    "uri": "http://dx.doi.org/10.13039/100019428",
@ -287,13 +263,13 @@
    "id": "100019637",
    "uri": "http://dx.doi.org/10.13039/100019637",
    "name": "Horizon Therapeutics",
-    "synonym": []
+    "synonym": ["100012754"]
  },
  {
    "id": "100020174",
    "uri": "http://dx.doi.org/10.13039/100020174",
    "name": "Health Research Charities Ireland",
-    "synonym": []
+    "synonym": ["100012891"]
  },
  {
    "id": "100020202",
@ -325,12 +301,7 @@
    "name": "Centre for Ageing Research and Development in Ireland",
    "synonym": []
  },
-  {
-    "id": "501100001583",
-    "uri": "http://dx.doi.org/10.13039/501100001583",
-    "name": "Cystinosis Foundation Ireland",
-    "synonym": []
-  },
+
  {
    "id": "501100001584",
    "uri": "http://dx.doi.org/10.13039/501100001584",
@ -527,7 +498,7 @@
    "id": "501100003037",
    "uri": "http://dx.doi.org/10.13039/501100003037",
    "name": "Elan",
-    "synonym": []
+    "synonym": ["501100021694"]
  },
  {
    "id": "501100003496",
@ -601,17 +572,11 @@
    "name": "Technological University Dublin",
    "synonym": []
  },
-  {
-    "id": "501100009269",
-    "uri": "http://dx.doi.org/10.13039/501100009269",
-    "name": "Programme of Competitive Forestry Research for Development",
-    "synonym": []
-  },
  {
    "id": "501100009315",
    "uri": "http://dx.doi.org/10.13039/501100009315",
    "name": "Cystinosis Ireland",
-    "synonym": []
+    "synonym": ["501100001583"]
  },
  {
    "id": "501100010808",
@ -631,12 +596,6 @@
    "name": "Alimentary Health",
    "synonym": []
  },
-  {
-    "id": "501100011103",
-    "uri": "http://dx.doi.org/10.13039/501100011103",
-    "name": "Rann\u00eds",
-    "synonym": []
-  },
  {
    "id": "501100012354",
    "uri": "http://dx.doi.org/10.13039/501100012354",
@ -739,12 +698,6 @@
    "name": "Insight SFI Research Centre for Data Analytics",
    "synonym": []
  },
-  {
-    "id": "501100021694",
-    "uri": "http://dx.doi.org/10.13039/501100021694",
-    "name": "Elan Pharma International",
-    "synonym": []
-  },
  {
    "id": "501100021838",
    "uri": "http://dx.doi.org/10.13039/501100021838",
@ -775,12 +728,6 @@
    "name": "Institute of Technology, Tralee",
    "synonym": []
  },
-  {
-    "id": "501100023273",
-    "uri": "http://dx.doi.org/10.13039/501100023273",
-    "name": "HRB Clinical Research Facility Galway",
-    "synonym": []
-  },
  {
    "id": "501100023378",
    "uri": "http://dx.doi.org/10.13039/501100023378",
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
            tp._1 match {
              case "electronic" => journal.setIssnOnline(tp._2)
              case "print"      => journal.setIssnPrinted(tp._2)
+              case _            =>
            }
          })
        }
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
@ -5,7 +5,17 @@ import eu.dnetlib.dhp.schema.action.AtomicAction
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
 import eu.dnetlib.dhp.schema.oaf.utils.{OafMapperUtils, PidType}
-import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, Journal, Organization, Publication, Relation, Result, Dataset => OafDataset}
+import eu.dnetlib.dhp.schema.oaf.{
+  Author,
+  DataInfo,
+  Instance,
+  Journal,
+  Organization,
+  Publication,
+  Relation,
+  Result,
+  Dataset => OafDataset
+}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Dataset, Row, SparkSession}
@ -69,23 +79,6 @@ object MagUtility extends Serializable {
  private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)

  private val MAGDataInfo: DataInfo = {
-    val di = new DataInfo
-    di.setDeletedbyinference(false)
-    di.setInferred(false)
-    di.setInvisible(false)
-    di.setTrust("0.9")
-    di.setProvenanceaction(
-      OafMapperUtils.qualifier(
-        ModelConstants.SYSIMPORT_ACTIONSET,
-        ModelConstants.SYSIMPORT_ACTIONSET,
-        ModelConstants.DNET_PROVENANCE_ACTIONS,
-        ModelConstants.DNET_PROVENANCE_ACTIONS
-      )
-    )
-    di
-  }
-
-  private val MAGDataInfoInvisible: DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
@ -443,7 +436,6 @@ object MagUtility extends Serializable {

      case "repository" =>
        result = new Publication()
-        result.setDataInfo(MAGDataInfoInvisible)
        qualifier(
          "0038",
          "Other literature type",
@ -478,7 +470,6 @@ object MagUtility extends Serializable {
    }

    if (result != null) {
-      if (result.getDataInfo == null)
      result.setDataInfo(MAGDataInfo)
      val i = new Instance
      i.setInstancetype(tp)
@ -502,7 +493,7 @@ object MagUtility extends Serializable {
      return null

    result.setCollectedfrom(List(MAGCollectedFrom).asJava)
-    val pidList = List(
+    var pidList = List(
      structuredProperty(
        paper.paperId.get.toString,
        qualifier(
@ -515,8 +506,6 @@ object MagUtility extends Serializable {
      )
    )

-    result.setPid(pidList.asJava)
-
    result.setOriginalId(pidList.map(s => s.getValue).asJava)

    result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@ -608,10 +597,9 @@ object MagUtility extends Serializable {
    }

    val instance = result.getInstance().get(0)
-    instance.setPid(pidList.asJava)
-    if (paper.doi.orNull != null)
-      instance.setAlternateIdentifier(
-        List(
+
+    if (paper.doi.orNull != null) {
+      pidList = pidList ::: List(
        structuredProperty(
          paper.doi.get,
          qualifier(
@ -622,8 +610,10 @@ object MagUtility extends Serializable {
          ),
          null
        )
-        ).asJava
      )
+    }
+    instance.setPid(pidList.asJava)
+    result.setPid(pidList.asJava)
    instance.setUrl(paper.urls.get.asJava)
    instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
    instance.setCollectedfrom(MAGCollectedFrom)
@ -688,33 +678,45 @@ object MagUtility extends Serializable {
    o.setLegalname(field(r.getAs[String]("DisplayName"), null))
    val gid = r.getAs[String]("GridId")
    if (gid != null) {
-      o.setPid(List(
-        structuredProperty(gid, qualifier(
+      o.setPid(
+        List(
+          structuredProperty(
+            gid,
+            qualifier(
              PidType.GRID.toString,
              PidType.GRID.toString,
              ModelConstants.DNET_PID_TYPES,
              ModelConstants.DNET_PID_TYPES
            ),
-          null),
-        structuredProperty(r.getAs[Long]("AffiliationId").toString, qualifier(
+            null
+          ),
+          structuredProperty(
+            r.getAs[Long]("AffiliationId").toString,
+            qualifier(
              PidType.mag_id.toString,
              PidType.mag_id.toString,
              ModelConstants.DNET_PID_TYPES,
              ModelConstants.DNET_PID_TYPES
            ),
-          null)
-
-      ).asJava)
+            null
+          )
+        ).asJava
+      )
    } else {
-      o.setPid(List(
-        structuredProperty(r.getAs[Long]("AffiliationId").toString, qualifier(
+      o.setPid(
+        List(
+          structuredProperty(
+            r.getAs[Long]("AffiliationId").toString,
+            qualifier(
              PidType.mag_id.toString,
              PidType.mag_id.toString,
              ModelConstants.DNET_PID_TYPES,
              ModelConstants.DNET_PID_TYPES
            ),
-          null)
-      ).asJava)
+            null
+          )
+        ).asJava
+      )
    }
    val c = r.getAs[String]("Iso3166Code")
    if (c != null)
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
    spark.read
      .load(s"$magBasePath/mag_denormalized")
      .as[MAGPaper]
+      .filter(col("doi").isNotNull)
      .map(s => MagUtility.convertMAGtoOAF(s))
      .filter(s => s != null)
      .write
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMagOrganizationAS.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMagOrganizationAS.scala
@ -24,7 +24,10 @@ class SparkMagOrganizationAS (propertyPath: String, args: Array[String], log: Lo
  def generateAS(spark: SparkSession, magBasePath: String, outputPath: String): Unit = {
    import spark.implicits._
    val organizations = MagUtility.loadMagEntity(spark, "Affiliations", magBasePath)
-    organizations.map(r => MagUtility.generateOrganization(r)).write.mode(SaveMode.Overwrite)
+    organizations
+      .map(r => MagUtility.generateOrganization(r))
+      .write
+      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .text(outputPath)
  }
@ -33,6 +36,7 @@ class SparkMagOrganizationAS (propertyPath: String, args: Array[String], log: Lo
 object SparkMagOrganizationAS {

  val log: Logger = LoggerFactory.getLogger(SparkMagOrganizationAS.getClass)
+
  def main(args: Array[String]): Unit = {
    new SparkMagOrganizationAS("/eu/dnetlib/dhp/collection/mag/create_organization_AS.json", args, log)
      .initialize()
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
-import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.pubmed._
-import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
 import org.apache.http.impl.client.HttpClientBuilder
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql._
+import org.apache.spark.sql.expressions.Aggregator
 import org.slf4j.{Logger, LoggerFactory}

-import java.io.InputStream
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
+import java.io.{ByteArrayInputStream, InputStream}
+import java.nio.charset.Charset
+import javax.xml.stream.XMLInputFactory

 object SparkCreateBaselineDataFrame {

@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
          if (response.getStatusLine.getStatusCode > 400) {
            tries -= 1
          } else
-            return IOUtils.toString(response.getEntity.getContent)
+            return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
        } catch {
          case e: Throwable =>
            println(s"Error on requesting ${r.getURI}")
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
      IOUtils.toString(
        SparkEBILinksToOaf.getClass.getResourceAsStream(
          "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
-        )
+        ),
+        Charset.defaultCharset()
      )
    )
    parser.parseArgument(args)
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
    val workingPath = parser.get("workingPath")
    log.info("workingPath: {}", workingPath)

-    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
-    log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
-
-    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
-    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
-    log.info("outputBasePath: {}", outputBasePath)
+    val targetPath = parser.get("targetPath")
+    log.info("targetPath: {}", targetPath)

    val hdfsServerUri = parser.get("hdfsServerUri")
-    log.info("hdfsServerUri: {}", hdfsServerUri)
+    log.info("hdfsServerUri: {}", targetPath)

    val skipUpdate = parser.get("skipUpdate")
    log.info("skipUpdate: {}", skipUpdate)
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
    if (!"true".equalsIgnoreCase(skipUpdate)) {
      downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
      val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
+      val inputFactory = XMLInputFactory.newInstance
      val ds: Dataset[PMArticle] = spark.createDataset(
        k.filter(i => i._1.endsWith(".gz"))
          .flatMap(i => {
-            val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+            val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
            new PMParser(xml)
          })
      )
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
        .map(a => PubMedToOaf.convert(a, vocabularies))
        .as[Oaf]
        .filter(p => p != null),
-      s"$outputBasePath/$MDSTORE_DATA_PATH"
+      targetPath
    )

-    val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
-    val mdStoreSize = df.count
-    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
@ -1,7 +1,8 @@
 package eu.dnetlib.dhp.sx.bio.pubmed

 import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}

 /** @param xml
  */
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
@ -119,7 +119,9 @@ public class ReadCOCITest {
 					workingDir.toString() + "/COCI",
 					"-outputPath",
 					workingDir.toString() + "/COCI_json/",
-					"-inputFile", "input1;input2;input3;input4;input5"
+					"-inputFile", "input1;input2;input3;input4;input5",
+					"-format",
+					"COCI"
 				});

 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
@ -0,0 +1,297 @@
+
+package eu.dnetlib.dhp.actionmanager.webcrawl;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
+import eu.dnetlib.dhp.schema.oaf.utils.PidType;
+
+/**
+ * @author miriam.baglioni
+ * @Date 22/04/24
+ */
+public class CreateASTest {
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	private static SparkSession spark;
+
+	private static Path workingDir;
+	private static final Logger log = LoggerFactory
+		.getLogger(CreateASTest.class);
+
+	@BeforeAll
+	public static void beforeAll() throws IOException {
+		workingDir = Files
+			.createTempDirectory(CreateASTest.class.getSimpleName());
+		log.info("using work dir {}", workingDir);
+
+		SparkConf conf = new SparkConf();
+		conf.setAppName(CreateASTest.class.getSimpleName());
+
+		conf.setMaster("local[*]");
+		conf.set("spark.driver.host", "localhost");
+		conf.set("hive.metastore.local", "true");
+		conf.set("spark.ui.enabled", "false");
+		conf.set("spark.sql.warehouse.dir", workingDir.toString());
+		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+		spark = SparkSession
+			.builder()
+			.appName(CreateASTest.class.getSimpleName())
+			.config(conf)
+			.getOrCreate();
+	}
+
+	@AfterAll
+	public static void afterAll() throws IOException {
+		FileUtils.deleteDirectory(workingDir.toFile());
+		spark.stop();
+	}
+
+	@Test
+	void testNumberofRelations() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
+			.getPath();
+		String blackListPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
+			.getPath();
+
+		CreateActionSetFromWebEntries
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-sourcePath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet1",
+					"-blackListPath", blackListPath
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		Assertions.assertEquals(58, tmp.count());
+
+	}
+
+	@Test
+	void testRelations() throws Exception {
+
+//		 , "doi":"https://doi.org/10.1126/science.1188021", "pmid":"https://pubmed.ncbi.nlm.nih.gov/20448178", https://www.ncbi.nlm.nih.gov/pmc/articles/5100745
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/")
+			.getPath();
+		String blackListPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
+			.getPath();
+
+		CreateActionSetFromWebEntries
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-sourcePath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet1",
+					"-blackListPath", blackListPath
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
+
+		Assertions
+			.assertEquals(
+				1, tmp
+					.filter(
+						r -> r
+							.getSource()
+							.equals(
+								"50|doi_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue(PidType.doi.toString(), "10.1098/rstl.1684.0023"))))
+					.count());
+
+		Assertions
+			.assertEquals(
+				1, tmp
+					.filter(
+						r -> r
+							.getTarget()
+							.equals(
+								"50|doi_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue(PidType.doi.toString(), "10.1098/rstl.1684.0023"))))
+					.count());
+
+		Assertions
+			.assertEquals(
+				1, tmp
+					.filter(
+						r -> r
+							.getSource()
+							.equals(
+								"20|ror_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue("ROR", "https://ror.org/03argrj65"))))
+					.count());
+
+		Assertions
+			.assertEquals(
+				1, tmp
+					.filter(
+						r -> r
+							.getTarget()
+							.equals(
+								"20|ror_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue("ROR", "https://ror.org/03argrj65"))))
+					.count());
+
+		Assertions
+			.assertEquals(
+				2, tmp
+					.filter(
+						r -> r
+							.getSource()
+							.equals(
+								"20|ror_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue("ROR", "https://ror.org/03265fv13"))))
+					.count());
+
+		Assertions
+			.assertEquals(
+				2, tmp
+					.filter(
+						r -> r
+							.getTarget()
+							.equals(
+								"20|ror_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue("ROR", "https://ror.org/03265fv13"))))
+					.count());
+
+		Assertions
+			.assertEquals(
+				1, tmp
+					.filter(
+						r -> r
+							.getTarget()
+							.equals(
+								"20|ror_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
+							&& r.getSource().startsWith("50|doi"))
+					.count());
+
+		Assertions
+			.assertEquals(
+				1, tmp
+					.filter(
+						r -> r
+							.getTarget()
+							.equals(
+								"20|ror_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
+							&& r.getSource().startsWith("50|pmid"))
+					.count());
+
+		Assertions
+			.assertEquals(
+				0, tmp
+					.filter(
+						r -> r
+							.getTarget()
+							.equals(
+								"20|ror_________::" + IdentifierFactory
+									.md5(
+										PidCleaner
+											.normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
+							&& r.getSource().startsWith("50|pmc"))
+					.count());
+	}
+
+	@Test
+	void testRelationsCollectedFrom() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/webcrawl")
+			.getPath();
+
+		CreateActionSetFromWebEntries
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-sourcePath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet1"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		tmp.foreach(r -> {
+			assertEquals("Web Crawl", r.getCollectedfrom().get(0).getValue());
+			assertEquals("10|openaire____::fb98a192f6a055ba495ef414c330834b", r.getCollectedfrom().get(0).getKey());
+		});
+
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java
@ -0,0 +1,64 @@
+
+package eu.dnetlib.dhp.collection.plugin.file;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Objects;
+import java.util.stream.Stream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
+@ExtendWith(MockitoExtension.class)
+public class FileGZipMultipleNodeTest {
+
+	private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
+
+	private final ApiDescriptor api = new ApiDescriptor();
+
+	private FileGZipCollectorPlugin plugin;
+
+	private static final String SPLIT_ON_ELEMENT = "incollection,article";
+
+	@BeforeEach
+	public void setUp() throws IOException {
+
+		final String gzipFile = Objects
+			.requireNonNull(
+				this
+					.getClass()
+					.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
+			.getFile();
+
+		api.setBaseUrl(gzipFile);
+
+		HashMap<String, String> params = new HashMap<>();
+		params.put("splitOnElement", SPLIT_ON_ELEMENT);
+
+		api.setParams(params);
+
+		FileSystem fs = FileSystem.get(new Configuration());
+		plugin = new FileGZipCollectorPlugin(fs);
+	}
+
+	@Test
+	void test() throws CollectorException {
+
+		final Stream<String> stream = plugin.collect(api, new AggregatorReport());
+
+		stream.limit(10).forEach(s -> {
+			Assertions.assertTrue(s.length() > 0);
+			log.info(s);
+		});
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
@ -1,7 +1,9 @@
+
 package eu.dnetlib.dhp.collection.plugin.rest;

 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Stream;

 import org.junit.jupiter.api.Assertions;
@ -34,11 +36,11 @@ public class OsfPreprintCollectorTest {
 	private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";

 	private final String resumptionParam = "page";
-	private final String resumptionType = "page";
-	private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
+	private final String resumptionType = "scan";
+	private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";

-	private final String resultSizeParam = "";
-	private final String resultSizeValue = "";
+	private final String resultSizeParam = "page[size]";
+	private final String resultSizeValue = "100";

 	private final String resultFormatParam = "format";
 	private final String resultFormatValue = "json";
@ -68,11 +70,11 @@ public class OsfPreprintCollectorTest {

 	@Test
 	@Disabled
-	void test() throws CollectorException {
+	void test_limited() throws CollectorException {
 		final AtomicInteger i = new AtomicInteger(0);
 		final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());

-		stream.limit(200).forEach(s -> {
+		stream.limit(2000).forEach(s -> {
 			Assertions.assertTrue(s.length() > 0);
 			i.incrementAndGet();
 			log.info(s);
@ -81,4 +83,23 @@ public class OsfPreprintCollectorTest {
 		log.info("{}", i.intValue());
 		Assertions.assertTrue(i.intValue() > 0);
 	}
+
+	@Test
+	@Disabled
+	void test_all() throws CollectorException {
+		final AtomicLong i = new AtomicLong(0);
+		final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
+
+		stream.forEach(s -> {
+			Assertions.assertTrue(s.length() > 0);
+			if ((i.incrementAndGet() % 1000) == 0) {
+				log.info("COLLECTED: {}", i.get());
+			}
+
+		});
+
+		log.info("TOTAL: {}", i.get());
+		Assertions.assertTrue(i.get() > 0);
+	}
+
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
@ -4,6 +4,11 @@

 package eu.dnetlib.dhp.collection.plugin.rest;

+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Stream;
@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
 import eu.dnetlib.dhp.common.collection.CollectorException;
@ -25,18 +32,18 @@ class RestCollectorPluginTest {

 	private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);

-	private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
-	private final String resumptionType = "count";
-	private final String resumptionParam = "from";
-	private final String entityXpath = "//hits/hits";
-	private final String resumptionXpath = "//hits";
-	private final String resultTotalXpath = "//hits/total";
-	private final String resultFormatParam = "format";
+	private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
+	private final String resumptionType = "discover";
+	private final String resumptionParam = "skip";
+	private final String entityXpath = "//*[local-name()='data']";
+	private final String resumptionXpath = "";
+	private final String resultTotalXpath = "//*[local-name()='count']";
+	private final String resultFormatParam = "";
 	private final String resultFormatValue = "json";
-	private final String resultSizeParam = "size";
+	private final String resultSizeParam = "top";
 	private final String resultSizeValue = "10";
 	// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
-	private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
+	private final String query = "";
 	// private String query = "=(sources:engrXiv AND type:preprint)";

 	private final String protocolDescriptor = "rest_json2xml";
@ -56,6 +63,7 @@ class RestCollectorPluginTest {
 		params.put("resultSizeValue", resultSizeValue);
 		params.put("queryParams", query);
 		params.put("entityXpath", entityXpath);
+		params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");

 		api.setBaseUrl(baseUrl);
 		api.setParams(params);
@ -78,4 +86,19 @@ class RestCollectorPluginTest {
 		log.info("{}", i.intValue());
 		Assertions.assertTrue(i.intValue() > 0);
 	}
+
+	@Disabled
+	@Test
+	void testUrl() throws IOException {
+		String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
+		URL url = new URL(url_s);
+		final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+		conn.setRequestMethod("GET");
+		conn.setRequestProperty("User-Agent", "OpenAIRE");
+		Gson gson = new Gson();
+		System.out.println("Request header");
+		System.out.println(gson.toJson(conn.getHeaderFields()));
+		InputStream inputStream = conn.getInputStream();
+
+	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
@ -44,7 +44,7 @@ public class RestIteratorTest {

 		final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
 			resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
-			query, entityXpath, authMethod, authToken, resultOffsetParam);
+			query, entityXpath, authMethod, authToken, resultOffsetParam, null);
 		int i = 20;
 		while (iterator.hasNext() && i > 0) {
 			String result = iterator.next();
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
@ -789,10 +789,6 @@
      "value": "2227-9717",
      "type": "electronic"
    },
-    {
-      "value": "VALUE",
-      "type": "PIPPO"
-    },
    {
      "value": "1063-4584",
      "type": "pu"
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref

 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import org.apache.commons.io.IOUtils
+import org.junit.jupiter.api.{BeforeEach, Test}
 import org.junit.jupiter.api.extension.ExtendWith
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.{Logger, LoggerFactory}
@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
    super.setUpVocabulary()
  }

+  @Test
+  def mappingRecord(): Unit = {
+    val input =
+      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
+
+    println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
+
+  }
+
 }
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.col
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test

@ -10,7 +11,6 @@ class MAGMappingTest {

  val mapper = new ObjectMapper()

-
  def mappingTest(): Unit = {

    val spark = SparkSession
@ -19,10 +19,8 @@ class MAGMappingTest {
      .master("local[*]")
      .getOrCreate()

-    val s = new SparkMagOrganizationAS(null, null, null)
-
-    s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
-
+    val s = new SparkMAGtoOAF(null, null, null)
+    s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
  }

  @Test
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension

 import java.io.{BufferedReader, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
+import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {

  @Test
  def testEBIData() = {
-    val inputXML = Source
-      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-      .mkString
-    val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
+    val inputFactory = XMLInputFactory.newInstance
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
  }

@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {

  @Test
  def testParsingPubmedXML(): Unit = {
-    val xml = new XMLEventReader(
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-    )
+    val inputFactory = XMLInputFactory.newInstance
+
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
    val parser = new PMParser(xml)
    parser.foreach(checkPMArticle)
  }
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
  @Test
  def testPubmedMapping(): Unit = {

-    val xml = new XMLEventReader(
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-    )
+    val inputFactory = XMLInputFactory.newInstance
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
    val parser = new PMParser(xml)
    val results = ListBuffer[Oaf]()
    parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
--- a/dhp-workflows/dhp-dedup-openaire/pom.xml
+++ b/dhp-workflows/dhp-dedup-openaire/pom.xml
@ -38,7 +38,6 @@
                </configuration>
            </plugin>
    	</plugins>
-    
    </build>

    <dependencies>
@ -54,24 +53,10 @@
            <artifactId>dhp-pace-core</artifactId>
            <version>${project.version}</version>
        </dependency>
-
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
-
-        <dependency>
-            <groupId>org.scala-lang.modules</groupId>
-            <artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
-            <version>1.0.2</version>
-        </dependency>
-
-        <dependency>
-            <groupId>org.scala-lang.modules</groupId>
-            <artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
-            <version>2.11.0</version>
-        </dependency>
-
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
@ -80,16 +65,10 @@
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
        </dependency>
-
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_${scala.binary.version}</artifactId>
        </dependency>
-
-        <dependency>
-            <groupId>com.arakelian</groupId>
-            <artifactId>java-jq</artifactId>
-        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
@ -102,10 +81,6 @@
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
-        <dependency>
-            <groupId>com.fasterxml.jackson.core</groupId>
-            <artifactId>jackson-core</artifactId>
-        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -189,7 +189,7 @@ public class DedupRecordFactory {
 			entity = swap;
 		}

-		entity = MergeUtils.checkedMerge(entity, duplicate);
+		entity = MergeUtils.checkedMerge(entity, duplicate, false);

 		if (ModelSupport.isSubClass(duplicate, Result.class)) {
 			Result re = (Result) entity;
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple3;
 import scala.collection.JavaConversions;

@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			Dataset<Row> pivotHistory = spark
 				.createDataset(
 					Collections.emptyList(),
-					RowEncoder
-						.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
+					SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));

 			if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
 				pivotHistory = spark
@ -175,6 +175,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			}

 			// cap pidType at w3id as from there on they are considered equal
+
 			UserDefinedFunction mapPid = udf(
 				(String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType);

--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
@ -44,9 +44,11 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 		parser.parseArgument(args);

 		SparkConf conf = new SparkConf();
-		new SparkCreateSimRels(parser, getSparkSession(conf))
+		try (SparkSession session = getSparkSession(conf)) {
+			new SparkCreateSimRels(parser, session)
 				.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
 		}
+	}

 	@Override
 	public void run(ISLookUpService isLookUpService)
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
 			.map(
 				(MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
 					"",
-					r._1()._2().getOriginalId().get(0),
-					r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
-					r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
-					r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
-					r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
-					r._1()._2().getCollectedfrom().get(0).getValue(),
+					Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
+					Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
+					Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
+					Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
+					Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
+					Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
 					"",
 					structuredPropertyListToString(r._1()._2().getPid()),
 					parseECField(r._1()._2().getEclegalbody()),
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
 					final Organization o = r._2()._2();
 					return new OrgSimRel(
 						r._1()._1(),
-						o.getOriginalId().get(0),
+						Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
 						Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
 						Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
 						Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
 			.map(
 				(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
 					OrgSimRel orgSimRel = r._1()._2();
-					orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
+					orgSimRel
+						.setLocal_id(
+							Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
 					return orgSimRel;
 				},
 				Encoders.bean(OrgSimRel.class));
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.catalyst.encoders.RowEncoder;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple2;
 import scala.Tuple3;

@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
 		StructType idsSchema = StructType
 			.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");

-		Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+		Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));

 		for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
 			String entityPath = graphBasePath + '/' + entityType.name();
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml
@ -102,6 +102,8 @@
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=15000
+                --conf spark.network.timeout=300s
+                --conf spark.shuffle.registration.timeout=50000
            </spark-opts>
            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
            <arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
@ -33,16 +33,14 @@
            <description>max number of elements in a connected component</description>
        </property>
        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
+            <name>sparkResourceOpts</name>
+            <value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
+            <description>spark resource options</description>
        </property>
        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
+            <name>sparkResourceOptsCreateMergeRel</name>
+            <value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
+            <description>spark resource options</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
@ -119,9 +117,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOpts}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -146,9 +142,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOpts}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -174,9 +168,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOptsCreateMergeRel}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -203,9 +195,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOpts}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -230,9 +220,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOpts}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -257,9 +245,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOpts}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -283,9 +269,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOpts}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -309,9 +293,7 @@
            <class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
+                ${sparkResourceOpts}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
@ -0,0 +1,103 @@
+
+package eu.dnetlib.dhp.oa.dedup;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.codehaus.jackson.map.ObjectMapper;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import scala.Tuple2;
+
+class DatasetMergerTest implements Serializable {
+
+	private List<Tuple2<String, Dataset>> datasets;
+
+	private String testEntityBasePath;
+	private DataInfo dataInfo;
+	private final String dedupId = "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148";
+	private Dataset dataset_top;
+
+	@BeforeEach
+	public void setUp() throws Exception {
+		testEntityBasePath = Paths
+			.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
+			.toFile()
+			.getAbsolutePath();
+
+		datasets = readSample(testEntityBasePath + "/dataset_merge.json", Dataset.class);
+
+		dataset_top = getTopPub(datasets);
+
+		dataInfo = setDI();
+	}
+
+	@Test
+	void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
+		Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
+
+		// verify id
+		assertEquals(dedupId, pub_merged.getId());
+		assertEquals(2, pub_merged.getInstance().size());
+	}
+
+	public DataInfo setDI() {
+		DataInfo dataInfo = new DataInfo();
+		dataInfo.setTrust("0.9");
+		dataInfo.setDeletedbyinference(false);
+		dataInfo.setInferenceprovenance("testing");
+		dataInfo.setInferred(true);
+		return dataInfo;
+	}
+
+	public Dataset getTopPub(List<Tuple2<String, Dataset>> publications) {
+
+		Double maxTrust = 0.0;
+		Dataset maxPub = new Dataset();
+		for (Tuple2<String, Dataset> publication : publications) {
+			Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
+			if (pubTrust > maxTrust) {
+				maxTrust = pubTrust;
+				maxPub = publication._2();
+			}
+		}
+		return maxPub;
+	}
+
+	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+		List<Tuple2<String, T>> res = new ArrayList<>();
+		BufferedReader reader;
+		try {
+			reader = new BufferedReader(new FileReader(path));
+			String line = reader.readLine();
+			while (line != null) {
+				res
+					.add(
+						new Tuple2<>(
+							MapDocumentUtil.getJPathString("$.id", line),
+							new ObjectMapper().readValue(line, clazz)));
+				// read next line
+				line = reader.readLine();
+			}
+			reader.close();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+
+		return res;
+	}
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -93,14 +93,14 @@ class EntityMergerTest implements Serializable {
 		assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
 		assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
 		assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
-		assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
-		assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
-		assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
-		assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
+		assertEquals(pub_top.getResulttype().getClassid(), pub_merged.getResulttype().getClassid());
+		assertEquals(pub_top.getLanguage().getClassid(), pub_merged.getLanguage().getClassid());
+		assertEquals("Elsevier BV", pub_merged.getPublisher().getValue());
+		assertEquals(pub_top.getEmbargoenddate().getValue(), pub_merged.getEmbargoenddate().getValue());
 		assertEquals(pub_top.getResourcetype().getClassid(), "");
 		assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
 		assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
-		assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
+		// assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
 		assertEquals(3, pub_merged.getInstance().size());
 		assertEquals(2, pub_merged.getCountry().size());
 		assertEquals(0, pub_merged.getSubject().size());
@ -123,7 +123,7 @@ class EntityMergerTest implements Serializable {
 		assertEquals(dataInfo, pub_merged.getDataInfo());

 		// verify datepicker
-		assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
+		assertEquals("2016-01-01", pub_merged.getDateofacceptance().getValue());

 		// verify authors
 		assertEquals(13, pub_merged.getAuthor().size());
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/IdGeneratorTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/IdGeneratorTest.java
@ -78,7 +78,7 @@ public class IdGeneratorTest {
 		System.out.println("winner 3 = " + id2);

 		assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
-		assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
+		assertEquals("50|dedup_wf_002::345e5d1b80537b0d0e0a49241ae9e516", id2);
 	}

 	@Test
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
@ -258,7 +258,6 @@ public class SparkDedupTest implements Serializable {
 			assertEquals(115, sw_simrel.count());
 		}

-
 		// check if the first relation in the whitelist exists
 		assertTrue(
 			sw_simrel
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java
@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
 			.count();

-		assertEquals(145, orgs_simrel);
+		assertEquals(86, orgs_simrel);
 	}

 	@Test
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
 			.count();

-		assertEquals(181, orgs_simrel);
+		assertEquals(122, orgs_simrel);
 	}

 	@Test
@ -196,7 +196,9 @@ public class SparkOpenorgsDedupTest implements Serializable {
 					"-la",
 					"lookupurl",
 					"-w",
-					testOutputBasePath
+					testOutputBasePath,
+					"-h",
+					""
 				});

 		new SparkCreateMergeRels(parser, spark).run(isLookUpService);
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java
@ -13,14 +13,16 @@ import java.io.Serializable;
 import java.net.URISyntaxException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.*;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
 import java.util.stream.Collectors;

 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
@ -129,7 +131,7 @@ public class SparkPublicationRootsTest implements Serializable {
 			.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
 			.count();

-		assertEquals(37, pubs_simrel);
+		assertEquals(9, pubs_simrel);
 	}

 	@Test
@ -142,7 +144,8 @@ public class SparkPublicationRootsTest implements Serializable {
 				"--actionSetId", testActionSetId,
 				"--isLookUpUrl", "lookupurl",
 				"--workingPath", workingPath,
-				"--cutConnectedComponent", "3"
+				"--cutConnectedComponent", "3",
+				"-h", ""
 			}), spark)
 				.run(isLookUpService);

@ -171,7 +174,8 @@ public class SparkPublicationRootsTest implements Serializable {
 				"--graphBasePath", graphInputPath,
 				"--actionSetId", testActionSetId,
 				"--isLookUpUrl", "lookupurl",
-				"--workingPath", workingPath
+				"--workingPath", workingPath,
+				"-h", ""
 			}), spark)
 				.run(isLookUpService);

@ -207,7 +211,7 @@ public class SparkPublicationRootsTest implements Serializable {
 			assertTrue(dups.contains(r.getSource()));
 		});

-		assertEquals(32, merges.count());
+		assertEquals(26, merges.count());
 	}

 	@Test
@ -228,7 +232,7 @@ public class SparkPublicationRootsTest implements Serializable {
 			.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
 			.map(asEntity(Publication.class), Encoders.bean(Publication.class));

-		assertEquals(3, roots.count());
+		assertEquals(4, roots.count());

 		final Dataset<Publication> pubs = spark
 			.read()
@ -369,7 +373,7 @@ public class SparkPublicationRootsTest implements Serializable {
 			.distinct()
 			.count();

-		assertEquals(19, publications); // 16 originals + 3 roots
+		assertEquals(20, publications); // 16 originals + 3 roots

 		long deletedPubs = spark
 			.read()
@ -380,7 +384,7 @@ public class SparkPublicationRootsTest implements Serializable {
 			.distinct()
 			.count();

-		assertEquals(mergedPubs, deletedPubs);
+//		assertEquals(mergedPubs, deletedPubs);
 	}

 	private static String classPathResourceAsString(String path) throws IOException {
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java
@ -169,10 +169,10 @@ public class SparkStatsTest implements Serializable {
 			.count();

 		assertEquals(414, orgs_blocks);
-		assertEquals(187, pubs_blocks);
-		assertEquals(128, sw_blocks);
-		assertEquals(192, ds_blocks);
-		assertEquals(194, orp_blocks);
+		assertEquals(221, pubs_blocks);
+		assertEquals(134, sw_blocks);
+		assertEquals(196, ds_blocks);
+		assertEquals(198, orp_blocks);
 	}

 	@AfterAll
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java
@ -30,7 +30,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
 import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.util.DownloadsReport;
 import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector;
-import jdk.nashorn.internal.ir.annotations.Ignore;

 public class OrcidClientTest {
 	final int REQ_LIMIT = 24;
@ -48,7 +47,7 @@ public class OrcidClientTest {
 	private static Path testPath;

 	@BeforeAll
-	private static void setUp() throws IOException {
+	public static void setUp() throws IOException {
 		testPath = Files.createTempDirectory(OrcidClientTest.class.getName());
 		System.out.println("using test path: " + testPath);
 	}
@ -151,9 +150,9 @@ public class OrcidClientTest {
 		System.out.println(valueDt.toString());
 	}

-	// @Test
-	@Ignore
-	private void testModifiedDate() throws ParseException {
+	@Test
+	@Disabled
+	public void testModifiedDate() throws ParseException {
 		testDate(toRetrieveDate);
 		testDate(toNotRetrieveDate);
 		testDate(shortDate);
@ -226,7 +225,7 @@ public class OrcidClientTest {

 	@Test
 	@Disabled
-	private void slowedDownDownloadTest() throws Exception {
+	public void slowedDownDownloadTest() throws Exception {
 		String orcid = "0000-0001-5496-1243";
 		String record = slowedDownDownload(orcid);
 		String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
@ -332,7 +331,7 @@ public class OrcidClientTest {
 	}

 	@Test
-	@Ignore
+	@Disabled
 	void testUpdatedRecord() throws Exception {
 		final String base64CompressedRecord = IOUtils
 			.toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
@ -341,7 +340,7 @@ public class OrcidClientTest {
 	}

 	@Test
-	@Ignore
+	@Disabled
 	void testUpdatedWork() throws Exception {
 		final String base64CompressedWork = "H4sIAAAAAAAAAM1XS2/jNhC+51cQOuxJsiXZSR03Vmq0G6Bo013E6R56oyXaZiOJWpKy4y783zvUg5Ksh5uiCJogisX5Zjj85sHx3f1rFKI94YKyeGE4I9tAJPZZQOPtwvj9+cGaGUhIHAc4ZDFZGEcijHvv6u7A+MtcPVCSSgsUQObYzuzaccBEguVuYYxt+LHgbwKP6a11M3WnY6UzrpB7KuiahlQeF0aSrkPqGwhcisWcxpLwGIcLYydlMh+PD4fDiHGfBvDcjmMxLhGlBglSH8vsIH0qGlLqBFRIGvvDWjWQ1iMJJ2CKBANqGlNqMbkj3IpxRPq1KkypFZFoDRHa0aRfq8JoNjhnfIAJJS6xPouiIQJyeYmGQzE+cO5cXqITcItBlKyASExD0a93jiwtvJDjYXDDAqBPHoH2wMmVWGNf8xyyaEBiSTeUDHHWBpd2Nmmc10yfbgHQrHCyIRxKjQwRUoFKPRwEnIgBnQJQVdGeQgJaCRN0OMnPkaUFVbD9WkpaIndQJowf+8EFoIpTErJjBFQOBavElFpfUxwC9ZcqvQErdQXhe+oPFF8BaObupYzVsYEOARzSoZBWmKqaBMHcV0Wf8oG0beIqD+Gdkz0lhyE3NajUW6fhQFSV9Nw/MCBYyofYa0EN7wrBz13eP+Y+J6obWgE8Pdd2JpYD94P77Ezmjj13b0bu5PqPu3EXumEnxEJaEVxSUIHammsra+53z44zt2/m1/bItaeVtQ6dhs3c4XytvW75IYUchMKvEHVUyqmnWBFAS0VJrqSvQde6vp251ux2NtFuKcVOi+oK9YY0M0Cn6o4J6WkvtEK2XJ1vfPGAZxSoK8lb+SxJBbLQx1CohOLndjJUywQWUFmqEi3G6Zaqf/7buOyYJd5IYpfmf0XipfP18pDR9cQCeEuJQI/Lx36bFbVnpBeL2UwmqQw7ApAvf4GeGGQdEbENgolui/wdpjHaYCmPCIPPAmGBIsxfoLUhyRCB0SeCakEBJRKBtfJ+UBbI15TG4PaGBAhWthx8DmFYtHZQujv1CWbLLdzmmUKmHEOWCe1/zdu78bn/+YH+hCOqOzcXfFwuP6OVT/P710crwqGXFrpNaM2GT3MXarw01i15TIi3pmtJXgtbTVGf3h6HKfF+wBAnPyTfdCChudlm5gZaoG//F9pPZsGQcqqbyZN5hBau5OoIJ3PPwjTKDuG4s5MZp2rMzF5PZoK34IT6PIFOPrk+mTiVO5aJH2C+JJRjE/06eoRfpJxa4VgyYaLlaJUv/EhCfATMU/76gEOfmehL/qbJNNHjaFna+CQYB8wvo9PpPFJ5MOrJ1Ix7USBZqBl7KRNOx1d3jex7SG6zuijqCMWRusBsncjZSrM2u82UJmqzpGhvUJN2t6caIM9QQgO9c0t40UROnWsJd2Rbs+nsxpna9u30ttNkjechmzHjEST+X5CkkuNY0GzQkzyFseAf7lSZuLwdh1xSXKvvQJ4g4abTYgPV7uMt3rskohlJmMa82kQkshtyBEIYqQ+YB8X3oRHg7iFKi/bZP+Ao+T6BJhIT/vNPi8ffZs+flk+r2v0WNroZiyWn6xRmadHqTJXsjLJczElAZX6TnJdoWTM1SI2gfutv3rjeBt5t06rVvNuWup29246tlvluO+u2/G92bK9DXheL6uFd/Q3EaRDZqBIAAA==";
 		final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
@ -413,7 +412,7 @@ public class OrcidClientTest {
 	}

 	@Test
-	@Ignore
+	@Disabled
 	void testDownloadedAuthor() throws Exception {
 		final String base64CompressedWork = "H4sIAAAAAAAAAI2Yy26jMBSG932KiD0hIe1MiwiVZjGLkWbX2XRHsFOsgs3YJmnefszFFy4+mUhtVPz9P/gcH/vQ9PWrrjYXzAVh9Bjst7tgg2nBEKEfx+DP28/wOdgImVOUV4ziY3DDInjNHlKOC8ZRMnxtmlyWxyDaqU+ofg7h/uX7IYwfn+Ngo25ARUKoxJzm1TEopWySKLper1vGC4LU74+IikgTWoFRW+SyfyyfxCBag4iQhBawyoGMDjdqJrnECJAZRquYLDEPaV5jv8oyWlXj+qTiXZLGr7KMiQbnjAOR6IY1W7C6hgIwjGt6SKGfHsY13ajHYipLIcIyJ5Xw6+akdvjEtyt4wxEwM6+VGph5N2zYr2ENhQRhKsmZYChmS1j7nFs6VIBPOwImKhyfMVeFg6GAWEjrcoQ4FoBmBGwVXYhagGHDBIEX+ZzUDiqyn35VN6rJUpUJ4zc/PAI2T03FbrUKJZQszWjV3zavVOjvVfoE01qB+YUUQPGNwHTt3luxJjdqh1AxJFBKLWOrSeCcF13RtxxYtlPOPqH6m+MLwVfoMQ2kdae2ArLajc6fTxkI1nIoegs0yB426pMO+0fSw07xDKMu0XKSde5C2VvrlVMijRzFwqY7XTJI1QMLWcmEzMxtDdxfHiYSgTNJnYJ1K9y5k0tUrMgrnGGaRiuXxxuClulYUbr0nBvpkYLjvgTCGsuSoex3f1CEvRPHKI184NJKtKeaiO7cD5E61bJ4F+9DFd7d01u8Tw6H5BBvvz8f3q3nXLGIeJULGdaqeVBBRK7rS7h/fNvvk/gpedxt4923dxP7Fc3KtKuc1BhlkrfYmeN4dcmrhmbw60+HmWw2CKgbTuqc32CXKTTmeTWT6bDBjPsQ0DTpnchdaYO0ayQ2FyLIiVREqs25aU8VKYLRbK0BsyZuqvr1MU2Sm/rDdhe/2CRN6FU/b+oBVyj1zqRtC5F8kAumfTclsl+s7EoNQu64nfOaVLeezX60Z3XCULLi6GI2IZGTEeey7fec9lBAuXawIHKcpifE7GABHWfoxLVfpUNPBXoMbZWrHFsR3bPAk9J9i2sw9nW6AQT1mpk++7JhW+v44Hmt8PomJqfD13jRnvFOSxCKtu6qHoyBbQ7cMFo750UEfGaXm6bEeplXIXj2hvL6mA7tzvIwmM9pbJFBG834POZdLGi2gH2u9u0K9HMwn5PTioFWLufzmrS4oNuU9Pkt2rf/2jMs7fMdm2rQTTM+j+49AzToAVuXYA1mD2k0+XdE9vAP+JYR5NcQAAA=";
 		final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
@ -421,7 +420,7 @@ public class OrcidClientTest {
 	}

 	@Test
-	@Ignore
+	@Disabled
 	void testDownloadedWork() throws Exception {
 		final String base64CompressedWork = "H4sIAAAAAAAAANVa63LiOBb+z1Oo+LVbhbkGAlTCLE1Id9IhTQV6unr/CVvB2tiWR5Khmal5rX2BfbE9ki3b3Jzt6Y13h6pQSPrOXTo6knL10zffQxvCBWXBdbVVb1YRCWzm0GB9Xf28vLX6VSQkDhzssYBcV3dEVH8aVa62jL8M1RcKI2kBAYwNLnrtXrMPFCGW7nW10YSPBX8dq3XRb1swNGgomkaG3FBBV9SjcnddDaOVR+0qApUCMaSBJDzA3nXVlTIcNhrb7bbOuE0d+F43AtEwCENBnMjGUhtyjiSFGBqHCkkDu5gqB0rpSMgJsCJOAVmKMVRMuoRbAfbJeaoMY6h84q8gQi4Nz1NlmNQbnDNe4Ak1bLA28/0iB8TjBg1GMV5gdzxu0CGoxSBKlkMkpp44T3eINBxeyG5bKDABpJb7QF1guRpOsd/iOWRRhwSSPlNS5LNjsOHzHAXxmjlHmwBSr3DyTDgsNVLkkAxk6LDjcCIKaBJAtoo2FCagFTJBiyf5IdJwUAv2PJUaNUgXlgnju/PgBJDFKfTYzgdXFgXLYAzVLxH2wPWvrfQ9mKEVhG+oXbD4EsD+3H1txqaxgQwBPqRFIc0w2WoSBHNbLfqIF0zbfVymIbQ52VCyLVIzBRm6VeQVRFWNHuoHDASLeJH3jqDVUQXB5yrOH0ObE5UNLQe+R+1mu2U1u1Z7sGy2hq3esN2tt5oXf79qnELv8fGwkJYPmxSswD1uA6vVXrY7w+5g2G3WuxedjNsJmj2escJx33G/ZXsU5iAs/AyRR0WcjpRXBLglc0lM1BjP59bX1qw9Hn/+dH87/dy9vBikeinKkyzVHjoqJNWIk7QuE3KU6pES6O7MwsarJh44QW1KowcWOCxAC9tlzEPsGX3YrYGQICgS0JKzENach2bEoTYNyKEQzaJyQnzSqesKSaV3IhRx92L8tLAm7GerjbZUujSwlFnIobqKkTuth+Q4ED4Vqqypp5JyfK8ah5Ji0f8AZVSGT2TZVGXfBLw/liOyqdRpJqfyXr8ldyEZrehKkm8Jr/2hc3Qb7EVk9DfMJbU98pu3k+6aETXXBebCZpt23tBaBUfSZRxdo98eYmgNfRxrh3zAnldDM/37FvZ+IiWtoQfddgiaEGBIDGCG7btA7jgBP9svAK2h90l4yYqIGop5jgMHXA4J0NB9ksR+YTX0qFtfqACO01jGjDHFPx552AW2W0P3uvGROk4NLfTvCeNS8X9MaDg1rL9Qz6PYh7En3f4ZNmKS6nUfQYFmE6PYe05IYBqPFGaq5wHlYpaoDbYqxokVK+JBerz51z+BIzc+SfSdTHVrTiSYtZzGFNOdGrr5ohsLF2+NUguqppkDoua6/S6yXwAYu44pM+/HiZ1BwEDWMqYbC5fjZ+MEBwMjb4PRLdTFYWrUwiUhJH/H+G3pMl/7fjqJhTGwSwU5lnfLsVDmxIPvmRetbJeCOsvfaxWXbXWxLVziqNky51BLW1OP2JKzgNoASSa7Gk1WAfrLI9mirzBBIUD1r/W/AgrMla7CjEMOzYBJolo30/mnxd0SzadPt5+eZtMb9O7rEN1wNINgEA8Ha+IxNMdrHLCQRR4TFRCudnmB7m6GqD0YDCqW+lQqlfnndw93iw/TJ/RwN5k+TqZDNJkAQyUvUlWvktjrdgbQEeI1EapN8Grd7MOeYJlfajSxWVOMfcIhVQXgfcFsqhcceobVA/U3GjsbDCYrjVSKSz0wHo8Xym6dArRvvjsbAfUGouFr8s5lG9o72DVVSy1saDqMqlarWW+12r2GiIXXMzuAU6AQcLLqWf3mZRf6iOlsNQdda9BudhQnvNNdPWN8XA7BgU5G2k3pLADA75XD3BSnn3y+3M90SbZWGczkxiRVmfSaJrd0V8u0yG3CeYRyht7O07Ste45weuqNmhcpLO44woEPRq1eilLN/f3ntEqGPFfzi2PmudHTO3EOEKf60LdTyUeDr7KIIzKfTfqtdr896JxklQtbES/IQD7UyL+SZIJSXYhLHkHZ9oqEjPR1MRzWu550cDYdCeI9n+S4hzouUU76+UeCQJ0fjkKn0+v3m703i0Eh/z97BCDH/XAAziTIt4rH94j7s4dHbSY/HJ90e3qriBQL+MMxCGETs9j/QxiSQ5PaS63/QsZqdS8vOxdvtj7Oc//fL4dTI2LvDAfVA6erSDKe3+cPxw70j4c5HHZlfLT9iAEZYKjZkxOYKZxymJy659l/t+QZllC5bvVJrzShD5GN0/NkiaZyqNcJh0NrdngtTfp7wviaHB+SS1Ng7O+Sk3h5HodT4S8RyY78pUmGM6eEg1l8tVCa1KnvY/SgrzDKsxRLF46j+uahNKH3BE6lsIb1lUxpUhdS3WUE+u6nPP/qiyAsklumMhMz9SBNqeus0oQ+QXqwIa7m3qy87IhXnBLPI8kVXXlZMaASm5vAEqWuKYkvHMtbPdiPiIdm6dVmeVMZjX+lfnKDWmaRAT7ev6ctTfhEF3RoWnJeXlKfSXcHcsf69rk0wTd4Qx30RV9yl5et2Ipwqe/SS5MJXiU8vbIv2b/qZaC8PZ65AUwj9QJR3vx1mQ9b7VPy1FFebnSpWq7xi0qJuwA+fLYpL7rwJdLXobcSa97kM4Cl35f3YXmofp0+8R9gBc/XeXL9Vn38pH7mLTs27z9T8ky1n7ynlZ0I4le78rYzl6t/woG5krwQlpcRcLDD2UPkH5F73C9G5tFKfY0q/wa1TIHI0CgAAA==";
 		final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
@ -4,21 +4,12 @@ package eu.dnetlib.dhp.bulktag;
 import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

-import java.io.BufferedOutputStream;
-import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.stream.Collectors;
-import java.util.zip.GZIPOutputStream;

-import org.apache.avro.TestAnnotation;
-import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.SparkConf;
@ -34,7 +25,6 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.gson.Gson;
-import com.sun.media.sound.ModelInstrumentComparator;

 import eu.dnetlib.dhp.api.Utils;
 import eu.dnetlib.dhp.api.model.CommunityEntityMap;
@ -182,7 +172,7 @@ public class SparkBulkTagJob {
 			.option("compression", "gzip")
 			.json(outputPath + "project");

-		readPath(spark, outputPath + "project", Datasource.class)
+		readPath(spark, outputPath + "project", Project.class)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/SparkResultToCommunityFromProject.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/SparkResultToCommunityFromProject.java
@ -161,7 +161,7 @@ public class SparkResultToCommunityFromProject implements Serializable {
 					}
 				}
 				res.setContext(propagatedContexts);
-				return MergeUtils.checkedMerge(ret, res);
+				return MergeUtils.checkedMerge(ret, res, true);
 			}
 			return ret;
 		};
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
@ -100,16 +100,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.speculation=false
-                --conf spark.hadoop.mapreduce.map.speculative=false
-                --conf spark.hadoop.mapreduce.reduce.speculative=false
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -132,12 +128,11 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -160,12 +155,11 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -188,12 +182,11 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -218,12 +211,11 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
            <arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
@ -247,19 +239,14 @@
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=4
-                --executor-memory=4G
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=5G
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-                --conf spark.speculation=false
-                --conf spark.hadoop.mapreduce.map.speculative=false
-                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
@ -282,15 +269,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-                --conf spark.speculation=false
-                --conf spark.hadoop.mapreduce.map.speculative=false
-                --conf spark.hadoop.mapreduce.reduce.speculative=false
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
@ -312,15 +296,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-                --conf spark.speculation=false
-                --conf spark.hadoop.mapreduce.map.speculative=false
-                --conf spark.hadoop.mapreduce.reduce.speculative=false
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
@ -342,15 +323,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-                --conf spark.speculation=false
-                --conf spark.hadoop.mapreduce.map.speculative=false
-                --conf spark.hadoop.mapreduce.reduce.speculative=false
+                --conf spark.sql.shuffle.partitions=4000
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
@ -363,15 +341,6 @@
    
    <join name="wait2" to="End"/>
    
-<!--    <action name="reset_workingDir">-->
-<!--        <fs>-->
-<!--            <delete path="${workingDir}"/>-->
-<!--            <mkdir path="${workingDir}"/>-->
-<!--        </fs>-->
-<!--        <ok to="End"/>-->
-<!--        <error to="Kill"/>-->
-<!--    </action>-->
-    
    <end name="End"/>
    
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@ -90,6 +90,12 @@
            <version>${project.version}</version>
        </dependency>

+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-pace-core</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.clean;
 import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.concurrent.atomic.AtomicReference;

 import org.apache.commons.lang3.SerializationUtils;
@ -29,7 +30,10 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 		mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
 		mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
 		mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
-		mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
+
+		// commenting out the subject cleaning until we decide if we want to it or not and the implementation will
+		// be completed. At the moment it is not capable of expanding the whole hierarchy.
+		// mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
 		return mapping;
 	}

@ -38,6 +42,13 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 		// TODO cleaning based on different subject vocabs can be added here
 	}

+	/**
+	 * The procedure cleans out the subject values, using a vocabulary identified by the field subject.qualifier.classid.
+	 *
+	 * @param vocabularyId
+	 * @param vocabularies
+	 * @param subject
+	 */
 	private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
 		Subject subject) {

@ -49,9 +60,16 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 					subject.getQualifier().setClassid(vocabularyId);
 					subject.getQualifier().setClassname(vocabulary.getName());
 				}
-			} else if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
-				Objects.nonNull(subject.getDataInfo()) &&
-				!"subject:fos".equals(subject.getDataInfo().getProvenanceaction())) {
+			} else {
+				final String provenanceActionClassId = Optional
+					.ofNullable(subject.getDataInfo())
+					.map(DataInfo::getProvenanceaction)
+					.map(Qualifier::getClassid)
+					.orElse(null);
+
+				if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
+					!"subject:fos".equals(provenanceActionClassId)) {
+
 					Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
 					VocabularyTerm term = vocabulary.getTerm(subject.getValue());
 					if (Objects.isNull(syn) && Objects.isNull(term)) {
@ -59,6 +77,7 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 						subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
 					}
 				}
+			}
 		});
 	}

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@ -130,7 +130,8 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication {
 		switch (mode) {
 			case claim:
 				save(
-					inputRdd.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
+					inputRdd
+						.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
 						.groupByKey()
 						.map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
 					// .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@ -398,6 +398,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 			o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
 			o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
 			o.setCountry(prepareQualifierSplitting(rs.getString("country")));
+			o.setOrganizationType(Organization.OrganizationType.valueOf(rs.getString("typology")));
 			o.setDataInfo(info);
 			o.setLastupdatetimestamp(lastUpdateTimestamp);

--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -156,6 +156,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -190,6 +191,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -224,6 +226,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -258,6 +261,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -292,6 +296,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -326,6 +331,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -360,6 +366,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -394,6 +401,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/Show More
+++ b/Show More