[graph provision] log the Solr admin application operations for alias deletion and creation

renamed workflow to better reflect its purpose
code formatting
2024-07-15 16:30:43 +02:00 · 2024-07-15 15:24:38 +02:00 · 2024-07-15 09:32:04 +02:00 · 2024-07-15 09:18:58 +02:00 · 2024-07-15 09:18:46 +02:00 · 2024-07-12 10:27:50 +02:00
153 changed files with 4812 additions and 2779 deletions
--- a/.gitignore
+++ b/.gitignore
@ -27,3 +27,4 @@ spark-warehouse
 /**/.factorypath
 /**/.scalafmt.conf
 /.java-version
 /dhp-shade-package/dependency-reduced-pom.xml
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
 		mojo.outputFile = testFolder;
 		// execute
-		Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+		try {
 			mojo.execute();
 			Assertions.assertTrue(false); // not reached
 		} catch (Exception e) {
 			Assertions
 				.assertTrue(
 					MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
 						IllegalArgumentException.class.isAssignableFrom(e.getClass()));
 		}
 	}
 	@Test
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -70,10 +70,7 @@
 			<groupId>com.ibm.icu</groupId>
 			<artifactId>icu4j</artifactId>
 		</dependency>
-		<dependency>
+
 			<groupId>org.apache.hadoop</groupId>
 			<artifactId>hadoop-common</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>com.github.sisyphsu</groupId>
 			<artifactId>dateparser</artifactId>
@ -163,7 +160,7 @@
 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>${dhp-schemas.artifact}</artifactId>
+			<artifactId>dhp-schemas</artifactId>
 		</dependency>
 		<dependency>
@ -172,4 +169,23 @@
 		</dependency>
 	</dependencies>
 	<!-- dependencies required on JDK9+ because J2EE has been removed -->
 	<profiles>
 		<profile>
 			<id>spark-34</id>
 			<dependencies>
 				<dependency>
 					<groupId>javax.xml.bind</groupId>
 					<artifactId>jaxb-api</artifactId>
 					<version>2.2.11</version>
 				</dependency>
 				<dependency>
 					<groupId>com.sun.xml.ws</groupId>
 					<artifactId>jaxws-ri</artifactId>
 					<version>2.3.3</version>
 					<type>pom</type>
 				</dependency>
 			</dependencies>
 		</profile>
 	</profiles>
 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
@ -38,7 +38,7 @@ public class PacePerson {
 					PacePerson.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/common/name_particles.txt")));
-		} catch (IOException e) {
+		} catch (Exception e) {
 			throw new RuntimeException(e);
 		}
 	}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
 	 *            part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
 	 *            concept_rec_id = 656930
 	 * @return response code
 	 * @throws IOException
 	 * @throws MissingConceptDoiException
 	 */
 	public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
 		setDepositionId(concept_rec_id, 1);
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.math.NumberUtils;
 import org.apache.commons.lang3.time.DateUtils;
 import org.apache.http.HttpHeaders;
 import org.joda.time.Instant;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java
@ -0,0 +1,106 @@
 package eu.dnetlib.dhp.schema.oaf.utils;
 import java.util.*;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.schema.oaf.Result;
 /**
  * Ranks {@link Oaf} records to decide which one takes precedence when they are merged.
  * A positive result means {@code left} ranks above {@code right}.
  * <p>
  * Criteria, applied in order until one of them discriminates:
  * <ol>
  * <li>{@code null} records rank below non-null ones;</li>
  * <li>the position in {@link #PID_AUTHORITIES} of the collectedfrom keys (later entries rank higher);</li>
  * <li>the trust declared in the datainfo, compared only when both records declare one;</li>
  * <li>for {@link Result}s, the position of the resulttype classid in {@link #RESULT_TYPES}
  * (later entries rank higher, a missing classid ranks lowest);</li>
  * <li>for {@link OafEntity} records, the natural ordering of the identifiers.</li>
  * </ol>
  */
 public class MergeEntitiesComparator implements Comparator<Oaf> {
 	/** Collectedfrom keys acting as PID authorities, from the lowest to the highest rank. */
 	static final List<String> PID_AUTHORITIES = Arrays
 		.asList(
 			ModelConstants.ARXIV_ID,
 			ModelConstants.PUBMED_CENTRAL_ID,
 			ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
 			ModelConstants.DATACITE_ID,
 			ModelConstants.CROSSREF_ID);
 	/** Result type classids, from the lowest to the highest rank. */
 	static final List<String> RESULT_TYPES = Arrays
 		.asList(
 			ModelConstants.ORP_RESULTTYPE_CLASSID,
 			ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
 			ModelConstants.DATASET_RESULTTYPE_CLASSID,
 			ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
 	public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
 	/**
 	 * Compares two records according to the criteria listed in the class documentation.
 	 *
 	 * @param left the first record, may be {@code null}
 	 * @param right the second record, may be {@code null}
 	 * @return a negative, zero or positive value when {@code left} ranks below, equal to or above {@code right}
 	 */
 	@Override
 	public int compare(Oaf left, Oaf right) {
 		if (left == null && right == null)
 			return 0;
 		if (left == null)
 			return -1;
 		if (right == null)
 			return 1;
 		// pid authority
 		int cfp1 = pidAuthorityRank(left);
 		int cfp2 = pidAuthorityRank(right);
 		if (cfp1 >= 0 && cfp1 > cfp2) {
 			return 1;
 		} else if (cfp2 >= 0 && cfp2 > cfp1) {
 			return -1;
 		}
 		int res = 0;
 		// trust: compared only when both sides declare it, a missing trust must not break the sort
 		if (left.getDataInfo() != null && right.getDataInfo() != null
 			&& left.getDataInfo().getTrust() != null && right.getDataInfo().getTrust() != null) {
 			res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
 		}
 		// result type
 		if (res == 0 && left instanceof Result && right instanceof Result) {
 			String classid1 = resultTypeClassid((Result) left);
 			String classid2 = resultTypeClassid((Result) right);
 			if (classid1 == null && classid2 != null) {
 				return -1;
 			}
 			if (classid1 != null && classid2 == null) {
 				return 1;
 			}
 			// when both classids are missing this criterion cannot discriminate: fall through to the id
 			if (classid1 != null) {
 				int rt1 = RESULT_TYPES.indexOf(classid1);
 				int rt2 = RESULT_TYPES.indexOf(classid2);
 				if (rt1 >= 0 && rt1 > rt2) {
 					return 1;
 				} else if (rt2 >= 0 && rt2 > rt1) {
 					return -1;
 				}
 			}
 		}
 		// id
 		if (res == 0 && left instanceof OafEntity && right instanceof OafEntity) {
 			res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
 		}
 		return res;
 	}
 	/**
 	 * Highest position in {@link #PID_AUTHORITIES} among the collectedfrom keys of the given record.
 	 *
 	 * @param oaf the record to inspect, not {@code null}
 	 * @return the highest index found, or -1 when collectedfrom is null, empty or contains no known authority
 	 */
 	private static int pidAuthorityRank(Oaf oaf) {
 		return Optional
 			.ofNullable(oaf.getCollectedfrom())
 			.map(
 				cf -> cf
 					.stream()
 					.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
 					.max(Integer::compare)
 					.orElse(-1))
 			.orElse(-1);
 	}
 	/**
 	 * Null-safe access to the resulttype classid.
 	 *
 	 * @param r the result to inspect, not {@code null}
 	 * @return the classid, or {@code null} when either the resulttype or its classid is missing
 	 */
 	private static String resultTypeClassid(Result r) {
 		return r.getResulttype() == null ? null : r.getResulttype().getClassid();
 	}
 }
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@ -40,27 +40,12 @@ public class MergeUtils {
 	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
 		boolean checkDelegateAuthority) {
 		TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
 			int res = 0;
-			if (o1.getDataInfo() != null && o2.getDataInfo() != null) {
+		ArrayList<T> sortedEntities = new ArrayList<>();
-				res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust());
+		oafEntityIterator.forEachRemaining(sortedEntities::add);
-			}
+		sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());
-			if (res == 0) {
+		Iterator<T> it = sortedEntities.iterator();
 				if (o1 instanceof Result && o2 instanceof Result) {
 					return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2);
 				}
 			}
 			return res;
 		});
 		while (oafEntityIterator.hasNext()) {
 			sortedEntities.add(oafEntityIterator.next());
 		}
 		Iterator<T> it = sortedEntities.descendingIterator();
 		T merged = it.next();
 		while (it.hasNext()) {
@ -143,7 +128,7 @@ public class MergeUtils {
 	 * https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
 	 * such version.
 	 * <p>
-	 * Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
+	 * Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator}
 	 * and proceeds with the canonical property merging.
 	 *
 	 * @param left
@ -161,8 +146,9 @@ public class MergeUtils {
 		if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
 			return right;
 		}
 		// TODO: raise trust to have preferred fields from one or the other??
-		if (new ResultTypeComparator().compare(left, right) < 0) {
+		if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) {
 			return mergeResultFields(left, right);
 		} else {
 			return mergeResultFields(right, left);
@ -225,9 +211,9 @@ public class MergeUtils {
 	private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
 		Function<T, K> keyExtractor, BinaryOperator<T> merger) {
-		if (left == null) {
+		if (left == null || left.isEmpty()) {
-			return right;
+			return right != null ? right : new ArrayList<>();
-		} else if (right == null) {
+		} else if (right == null || right.isEmpty()) {
 			return left;
 		}
@ -342,7 +328,7 @@ public class MergeUtils {
 		final T merged = mergeOafFields(original, enrich, trust);
 		merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
-		merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust));
+		merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1));
 		merged.setDateofcollection(LocalDateTime.now().toString());
 		merged
 			.setDateoftransformation(
@ -405,7 +391,7 @@ public class MergeUtils {
 		}
 		// should be an instance attribute, get the first non-null value
-		merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage()));
+		merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage()));
 		// distinct countries, do not manage datainfo
 		merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
@ -575,6 +561,13 @@ public class MergeUtils {
 		return m != null ? m : e;
 	}
 	/**
 	 * Picks the first usable qualifier: {@code m} unless it is null or its classid is null/blank,
 	 * in which case {@code e} is returned as-is (possibly null).
 	 */
 	private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
 		final boolean usable = m != null && m.getClassid() != null && !StringUtils.isBlank(m.getClassid());
 		return usable ? m : e;
 	}
 	private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
 		List<List<Author>> authors = new ArrayList<>();
 		if (author != null) {
@ -587,6 +580,10 @@ public class MergeUtils {
 	}
 	private static String instanceKeyExtractor(Instance i) {
 		// three levels of concatenating:
 		// 1. ::
 		// 2. @@
 		// 3. ||
 		return String
 			.join(
 				"::",
@ -594,10 +591,10 @@ public class MergeUtils {
 				kvKeyExtractor(i.getCollectedfrom()),
 				qualifierKeyExtractor(i.getAccessright()),
 				qualifierKeyExtractor(i.getInstancetype()),
-				Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
+				Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
 				Optional
 					.ofNullable(i.getPid())
-					.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
+					.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
 					.orElse(null));
 	}
@ -658,6 +655,13 @@ public class MergeUtils {
 			return d1;
 		}
 		if (StringUtils.contains(d1.getValue(), "null")) {
 			return d2;
 		}
 		if (StringUtils.contains(d2.getValue(), "null")) {
 			return d1;
 		}
 		return Stream
 			.of(d1, d2)
 			.min(
@ -706,7 +710,7 @@ public class MergeUtils {
 	private static String spKeyExtractor(StructuredProperty sp) {
 		return Optional
 			.ofNullable(sp)
-			.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
+			.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
 			.orElse(null);
 	}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java
@ -1,87 +0,0 @@
 package eu.dnetlib.dhp.schema.oaf.utils;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Optional;
 import java.util.stream.Collectors;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.Result;
 /**
  * Orders {@link Result}s so that the preferred one compares as smaller (negative return value
  * when {@code left} is preferred). Precedence: non-null over null, records collected from Crossref,
  * then result type in the order publication, dataset, software, other research product.
  */
 public class ResultTypeComparator implements Comparator<Result> {
 	public static final ResultTypeComparator INSTANCE = new ResultTypeComparator();
 	@Override
 	public int compare(Result left, Result right) {
 		// null records sort after non-null ones
 		if (left == null && right == null)
 			return 0;
 		if (left == null)
 			return 1;
 		if (right == null)
 			return -1;
 		HashSet<String> lCf = getCollectedFromIds(left);
 		HashSet<String> rCf = getCollectedFromIds(right);
 		// a record collected from Crossref wins over one that is not
 		if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
 			return -1;
 		}
 		if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
 			return 1;
 		}
 		// a missing resulttype (or classid) sorts after a declared one
 		if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
 			if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
 				return 0;
 			}
 			return 1;
 		} else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
 			return -1;
 		}
 		String lClass = left.getResulttype().getClassid();
 		String rClass = right.getResulttype().getClassid();
 		// different types: the first matching type in the cascade below decides
 		if (!lClass.equals(rClass)) {
 			if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
 				return -1;
 			if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
 				return 1;
 			if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
 				return -1;
 			if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
 				return 1;
 			if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
 				return -1;
 			if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
 				return 1;
 			if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
 				return -1;
 			if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
 				return 1;
 		}
 		// Else (but unlikely), lexicographical ordering will do.
 		return lClass.compareTo(rClass);
 	}
 	/**
 	 * Collects the keys of the collectedfrom entries of the given result.
 	 *
 	 * @param left the result to inspect
 	 * @return the set of collectedfrom keys, empty when collectedfrom is null
 	 */
 	protected HashSet<String> getCollectedFromIds(Result left) {
 		return Optional
 			.ofNullable(left.getCollectedfrom())
 			.map(
 				cf -> cf
 					.stream()
 					.map(KeyValue::getKey)
 					.collect(Collectors.toCollection(HashSet::new)))
 			.orElse(new HashSet<>());
 	}
 }
--- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
+++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
@ -154,5 +154,13 @@
  "unknown":{
    "original":"Unknown",
    "inverse":"Unknown"
  },
  "isamongtopnsimilardocuments": {
    "original": "IsAmongTopNSimilarDocuments",
    "inverse": "HasAmongTopNSimilarDocuments"
  },
  "hasamongtopnsimilardocuments": {
    "original": "HasAmongTopNSimilarDocuments",
    "inverse": "IsAmongTopNSimilarDocuments"
  }
 }
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
    val conf: SparkConf = new SparkConf()
    val master = parser.get("master")
    log.info(s"Creating Spark session: Master: $master")
-    SparkSession
+    val b = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
-      .master(master)
+    if (master != null)
-      .getOrCreate()
+      b.master(master)
    b.getOrCreate()
  }
  def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
  }
  /** Builds the Scholix resource for a result by way of its summary.
    * Returns null when `ScholixUtils.resultToSummary` yields null for the given result.
    */
  def generateScholixResourceFromResult(r: Result): ScholixResource = {
-    generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+    val sum = ScholixUtils.resultToSummary(r)
    // reuse the summary that was just null-checked instead of computing it a second time
    if (sum != null)
      generateScholixResourceFromSummary(sum)
    else
      null
  }
  val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
  }
  /** Looks up the lower-cased relation name in `relations` and returns its inverse,
    * or null when the relation is not found.
    */
  def invRel(rel: String): String = {
    val found = relations.getOrElse(rel.toLowerCase, null)
    if (found == null) null
    else found.inverse
  }
  def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
    if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
      val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
    if (persistentIdentifiers.isEmpty)
      return null
    s.setLocalIdentifier(persistentIdentifiers.asJava)
-    if (r.isInstanceOf[Publication])
+//    s.setTypology(r.getResulttype.getClassid)
      s.setTypology(Typology.publication)
    else
      s.setTypology(Typology.dataset)
    s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
--- a/dhp-pace-core/pom.xml
+++ b/dhp-pace-core/pom.xml
@ -24,7 +24,7 @@
 				<executions>
 					<execution>
 						<id>scala-compile-first</id>
-						<phase>initialize</phase>
+						<phase>process-resources</phase>
 						<goals>
 							<goal>add-source</goal>
 							<goal>compile</goal>
@ -59,14 +59,6 @@
 			<groupId>edu.cmu</groupId>
 			<artifactId>secondstring</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>com.google.guava</groupId>
 			<artifactId>guava</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>com.google.code.gson</groupId>
 			<artifactId>gson</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-lang3</artifactId>
@ -91,10 +83,6 @@
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-databind</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-math3</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>com.jayway.jsonpath</groupId>
 			<artifactId>json-path</artifactId>
@ -113,4 +101,90 @@
 		</dependency>
 	</dependencies>
 	<profiles>
 		<profile>
 			<id>spark-24</id>
 			<activation>
 				<activeByDefault>true</activeByDefault>
 			</activation>
 			<build>
 				<plugins>
 					<plugin>
 						<groupId>org.codehaus.mojo</groupId>
 						<artifactId>build-helper-maven-plugin</artifactId>
 						<version>3.4.0</version>
 						<executions>
 							<execution>
 								<phase>generate-sources</phase>
 								<goals>
 									<goal>add-source</goal>
 								</goals>
 								<configuration>
 									<sources>
 										<source>src/main/spark-2</source>
 									</sources>
 								</configuration>
 							</execution>
 						</executions>
 					</plugin>
 				</plugins>
 			</build>
 		</profile>
 		<profile>
 			<id>spark-34</id>
 			<build>
 				<plugins>
 					<plugin>
 						<groupId>org.codehaus.mojo</groupId>
 						<artifactId>build-helper-maven-plugin</artifactId>
 						<version>3.4.0</version>
 						<executions>
 							<execution>
 								<phase>generate-sources</phase>
 								<goals>
 									<goal>add-source</goal>
 								</goals>
 								<configuration>
 									<sources>
 										<source>src/main/spark-2</source>
 									</sources>
 								</configuration>
 							</execution>
 						</executions>
 					</plugin>
 				</plugins>
 			</build>
 		</profile>
 		<profile>
 			<id>spark-35</id>
 			<build>
 				<plugins>
 					<plugin>
 						<groupId>org.codehaus.mojo</groupId>
 						<artifactId>build-helper-maven-plugin</artifactId>
 						<version>3.4.0</version>
 						<executions>
 							<execution>
 								<phase>generate-sources</phase>
 								<goals>
 									<goal>add-source</goal>
 								</goals>
 								<configuration>
 									<sources>
 										<source>src/main/spark-35</source>
 									</sources>
 								</configuration>
 							</execution>
 						</executions>
 					</plugin>
 				</plugins>
 			</build>
 		</profile>
 	</profiles>
 </project>
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@ -1,12 +1,6 @@
 package eu.dnetlib.pace.common;
 import com.google.common.base.Joiner;
 import com.google.common.collect.Sets;
 import com.ibm.icu.text.Transliterator;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
@ -15,6 +9,13 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import com.google.common.base.Joiner;
 import com.google.common.collect.Sets;
 import com.ibm.icu.text.Transliterator;
 /**
 * Set of common functions for the framework
 *
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
 import com.jayway.jsonpath.{Configuration, JsonPath}
 import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
  val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
  val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(RowEncoder(schema))
+    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
  }
  def rowFromJson(json: String): Row = {
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CountryMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CountryMatch.java
@ -0,0 +1,47 @@
 package eu.dnetlib.pace.tree;
 import java.util.Map;
 import com.wcohen.ss.AbstractStringDistance;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.tree.support.AbstractStringComparator;
 import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("countryMatch")
 /**
  * Comparator for country fields: exact (case-sensitive) equality scores 1.0, a mismatch 0.0,
  * and -1.0 is returned when the comparison is undefined (empty or "unknown" values).
  */
 public class CountryMatch extends AbstractStringComparator {
    public CountryMatch(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
    }
    public CountryMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }
    protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }
    /**
     * @return -1.0 when either value is empty or equals "unknown" ignoring case,
     *         1.0 when the two values are equal, 0.0 otherwise
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        // a missing field or an UNKNOWN country makes the comparison undefined
        final boolean undefined = a.isEmpty()
            || b.isEmpty()
            || a.equalsIgnoreCase("unknown")
            || b.equalsIgnoreCase("unknown");
        if (undefined) {
            return -1.0;
        }
        if (a.equals(b)) {
            return 1.0;
        }
        return 0.0;
    }
    @Override
    public double getWeight() {
        return super.weight;
    }
    /** The distance is already in its final scale, so it is returned untouched. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
 }
--- a/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
+++ b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
@ -0,0 +1,12 @@
 package eu.dnetlib.pace.util
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
 import org.apache.spark.sql.types.StructType
 /** Compatibility shim for this Spark source set: row encoders are obtained through `RowEncoder`. */
 object SparkCompatUtils {
  /** Returns the row encoder built by `RowEncoder` for the given schema. */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] = RowEncoder(schema)
 }
--- a/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
+++ b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
@ -0,0 +1,12 @@
 package eu.dnetlib.pace.util
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.types.StructType
 /** Compatibility shim for this Spark source set: row encoders are obtained through `ExpressionEncoder`. */
 object SparkCompatUtils {
  /** Returns the row encoder built by `ExpressionEncoder` for the given schema. */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] = ExpressionEncoder(schema)
 }
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@ -336,4 +336,23 @@ public class ComparatorTest extends AbstractPaceTest {
 		System.out.println("compare = " + compare);
 	}
 	/** Checks the three possible outcomes of {@link CountryMatch}: undefined (-1), mismatch (0) and match (1). */
 	@Test
 	public void countryMatch() {
 		final CountryMatch comparator = new CountryMatch(params);
 		// an UNKNOWN country on either side makes the comparison undefined
 		assertEquals(-1.0, comparator.distance("UNKNOWN", "UNKNOWN", conf));
 		assertEquals(-1.0, comparator.distance("CHILE", "UNKNOWN", conf));
 		// two known countries either differ or match
 		assertEquals(0.0, comparator.distance("CHILE", "ITALY", conf));
 		assertEquals(1.0, comparator.distance("CHILE", "CHILE", conf));
 	}
 }
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.pace.model.Person;
 import jdk.nashorn.internal.ir.annotations.Ignore;
 public class UtilTest {
--- a/dhp-shade-package/dependency-reduced-pom.xml
+++ b/dhp-shade-package/dependency-reduced-pom.xml
@ -0,0 +1,113 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <parent>
    <artifactId>dhp</artifactId>
    <groupId>eu.dnetlib.dhp</groupId>
    <version>1.2.5-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <artifactId>dhp-shade-package</artifactId>
  <description>This module creates a jar of all module dependencies</description>
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-shade-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <transformer>
                  <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
                </transformer>
                <transformer />
                <transformer>
                  <resource>META-INF/cxf/bus-extensions.txt</resource>
                </transformer>
              </transformers>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/maven/**</exclude>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <relocations>
                <relocation>
                  <pattern>com</pattern>
                  <shadedPattern>repackaged.com.google.common</shadedPattern>
                  <includes>
                    <include>com.google.common.**</include>
                  </includes>
                </relocation>
              </relocations>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  <dependencies>
    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.18.28</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter</artifactId>
      <version>5.6.1</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <artifactId>junit-jupiter-api</artifactId>
          <groupId>org.junit.jupiter</groupId>
        </exclusion>
        <exclusion>
          <artifactId>junit-jupiter-params</artifactId>
          <groupId>org.junit.jupiter</groupId>
        </exclusion>
        <exclusion>
          <artifactId>junit-jupiter-engine</artifactId>
          <groupId>org.junit.jupiter</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-core</artifactId>
      <version>3.3.3</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <artifactId>byte-buddy</artifactId>
          <groupId>net.bytebuddy</groupId>
        </exclusion>
        <exclusion>
          <artifactId>byte-buddy-agent</artifactId>
          <groupId>net.bytebuddy</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-junit-jupiter</artifactId>
      <version>3.3.3</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <distributionManagement>
    <site>
      <id>DHPSite</id>
      <url>${dhp.site.stage.path}/dhp-common</url>
    </site>
  </distributionManagement>
 </project>
--- a/dhp-shade-package/pom.xml
+++ b/dhp-shade-package/pom.xml
@ -0,0 +1,169 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp</artifactId>
        <version>1.2.5-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
    <artifactId>dhp-shade-package</artifactId>
    <packaging>jar</packaging>
    <distributionManagement>
        <!-- Site staging location for this module: it uses the module's own artifactId so that the generated
             site is not written into the dhp-common site directory. -->
        <site>
            <id>DHPSite</id>
            <url>${dhp.site.stage.path}/dhp-shade-package</url>
        </site>
    </distributionManagement>
    <description>This module creates a jar containing all module dependencies</description>
    <dependencies>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-actionmanager</artifactId>
            <version>${project.version}</version>
        </dependency>
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
 <!--            <artifactId>dhp-aggregation</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
 <!--            <artifactId>dhp-blacklist</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
 <!--            <artifactId>dhp-broker-events</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
 <!--            <artifactId>dhp-dedup-openaire</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
 <!--            <artifactId>dhp-enrichment</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-graph-mapper</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-graph-provision</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-impact-indicators</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-stats-actionsets</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-stats-hist-snaps</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-stats-monitor-irish</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-stats-promote</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-stats-update</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-swh</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-usage-raw-data-update</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-usage-stats-build</artifactId>
            <version>${project.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <!-- Shade configuration for the merged jar: manifest and resource merging (transformers),
                                 removal of per-artifact metadata and signatures (filters), Guava relocation (relocations). -->
                            <transformers>
                                <!-- Writes the Main-Class entry of the merged jar's manifest.
                                     NOTE(review): the dhp-dedup-openaire dependency is commented out in this POM, so
                                     eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels may not be packaged in the jar; confirm. -->
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
                                </transformer>
                                <!-- Needed when dependencies rely on the Java ServiceLoader mechanism: the service
                                     descriptors of the individual jars must be merged instead of overwriting each other. -->
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                                <!-- Several dependencies may ship their own copy of this CXF resource; the appending
                                     transformer keeps the content of all of them in the merged jar. -->
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>META-INF/cxf/bus-extensions.txt</resource>
                                </transformer>
                            </transformers>
                            <filters>
                                <!-- Applied to every artifact (*:*): drops the Maven metadata and the jar signature
                                     files (.SF/.DSA/.RSA), which would no longer match the repackaged content. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/maven/**</exclude>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <relocations>
                                <!-- Moves the Guava classes (com.google.common.**) under a "repackaged" prefix.
                                     NOTE(review): the pattern is only "com" while the shaded pattern already ends in
                                     "com.google.common", so the relocated names presumably become
                                     repackaged.com.google.common.google.common.*; confirm whether
                                     "com.google.common" was intended as the pattern before changing it, since the
                                     shaded class names would change. -->
                                <relocation>
                                    <pattern>com</pattern>
                                    <shadedPattern>repackaged.com.google.common</shadedPattern>
                                    <includes>
                                        <include>com.google.common.**</include>
                                    </includes>
                                </relocation>
                            </relocations>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
 </project>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -156,6 +157,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
@ -125,6 +125,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -155,11 +156,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
@ -103,11 +103,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -156,11 +157,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
@ -95,11 +95,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=10000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -155,11 +156,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=4000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@ -9,6 +9,7 @@ import java.util.List;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
 					.union(openAPCRelations)
 					.union(dataciteRelations)
 					.saveAsHadoopFile(
-						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 			});
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -10,6 +10,7 @@ import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
 				resultsRDD
 					.union(projectsRDD)
 					.saveAsHadoopFile(
-						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 			});
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
@ -115,19 +115,7 @@ public class PrepareFOSSparkJob implements Serializable {
 			.forEach(
 				l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
 		r.setSubject(sbjs);
-		r
+
 			.setDataInfo(
 				OafMapperUtils
 					.dataInfo(
 						false, null, true,
 						false,
 						OafMapperUtils
 							.qualifier(
 								ModelConstants.PROVENANCE_ENRICH,
 								null,
 								ModelConstants.DNET_PROVENANCE_ACTIONS,
 								ModelConstants.DNET_PROVENANCE_ACTIONS),
 						null));
 		return r;
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java
@ -81,19 +81,7 @@ public class PrepareSDGSparkJob implements Serializable {
 						s -> sbjs
 							.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
 				r.setSubject(sbjs);
-				r
+
 					.setDataInfo(
 						OafMapperUtils
 							.dataInfo(
 								false, null, true,
 								false,
 								OafMapperUtils
 									.qualifier(
 										ModelConstants.PROVENANCE_ENRICH,
 										null,
 										ModelConstants.DNET_PROVENANCE_ACTIONS,
 										ModelConstants.DNET_PROVENANCE_ACTIONS),
 								null));
 				return r;
 			}, Encoders.bean(Result.class))
 			.write()
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.types.StructType;
@ -70,6 +71,9 @@ public class CreateActionSetFromWebEntries implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		final String blackListInputPath = parser.get("blackListPath");
 		log.info("blackListInputPath: {}", blackListInputPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
@ -77,29 +81,35 @@ public class CreateActionSetFromWebEntries implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
-				createActionSet(spark, inputPath, outputPath);
+				createActionSet(spark, inputPath, outputPath, blackListInputPath);
 			});
 	}
 	public static void createActionSet(SparkSession spark, String inputPath,
-		String outputPath) {
+		String outputPath, String blackListInputPath) {
 		final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
-			.filter("publication_year <= 2020 or country_code=='IE'")
+			.filter("country_code=='IE'")
 			.drop("publication_year");
-		dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
+		final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
 			List<Relation> ret = new ArrayList<>();
 			final String ror = ROR_PREFIX
 				+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
 			ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
 			ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
 			ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
-			return ret
+		dataset
-				.iterator();
+			.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
-		}, Encoders.bean(Relation.class))
+			.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
 			.drop("OpenAlexId")
 			.flatMap((FlatMapFunction<Row, Relation>) row -> {
 				List<Relation> ret = new ArrayList<>();
 				final String ror = ROR_PREFIX
 					+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
 				ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
 				ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
 				ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
 				return ret
 					.iterator();
 			}, Encoders.bean(Relation.class))
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p))
 			.mapToPair(
@ -136,6 +146,15 @@ public class CreateActionSetFromWebEntries implements Serializable {
 	}
 	/**
 	 * Loads the blacklist from a CSV file and keeps only its {@code OpenAlexId} column.
 	 * The first line of the CSV is read as the header.
 	 *
 	 * @param spark the Spark session used to read the file
 	 * @param inputPath the location of the blacklist CSV
 	 * @return a dataset with the single column {@code OpenAlexId}
 	 */
 	private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
 		final Dataset<Row> blackListCsv = spark
 			.read()
 			.option("header", true)
 			.csv(inputPath);
 		return blackListCsv.select("OpenAlexId");
 	}
 	private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
 		if (pmcid == null)
 			return new ArrayList<>();
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.collection.plugin.rest;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Spliterator;
 import java.util.Spliterators;
@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;
 import org.apache.commons.lang3.StringUtils;
 import com.google.gson.Gson;
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
 		final String entityXpath = api.getParams().get("entityXpath");
 		final String authMethod = api.getParams().get("authMethod");
 		final String authToken = api.getParams().get("authToken");
 		final String requestHeaderMap = api.getParams().get("requestHeaderMap");
 		Gson gson = new Gson();
 		Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
 		final String resultSizeValue = Optional
 			.ofNullable(api.getParams().get("resultSizeValue"))
 			.filter(StringUtils::isNotBlank)
@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
 		if (StringUtils.isBlank(resultFormatValue)) {
 			throw new CollectorException("Param 'resultFormatValue' is null or empty");
 		}
 		if (StringUtils.isBlank(queryParams)) {
 			throw new CollectorException("Param 'queryParams' is null or empty");
 		}
 		if (StringUtils.isBlank(entityXpath)) {
 			throw new CollectorException("Param 'entityXpath' is null or empty");
 		}
@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
 			entityXpath,
 			authMethod,
 			authToken,
-			resultOutputFormat);
+			resultOutputFormat,
 			requestHeaders);
 		return StreamSupport
 			.stream(
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
@ -9,8 +9,11 @@ import java.net.URL;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
@ -22,20 +25,20 @@ import javax.xml.xpath.*;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.http.HttpHeaders;
 import org.apache.http.entity.ContentType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 import com.google.common.collect.Maps;
 import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;
 /**
- * log.info(...) equal to  log.trace(...) in the application-logs
+ * log.info(...) equal to log.trace(...) in the application-logs
 * <p>
 * Known bug: with resumptionType 'discover', the collection fails when (resultTotal % resultSizeValue) == 0; workaround: change the resultSizeValue.
 *
@ -44,24 +47,29 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
 *
 */
 public class RestIterator implements Iterator<String> {
 	private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
 	public static final String UTF_8 = "UTF-8";
 	private static final int MAX_ATTEMPTS = 5;
 	private final HttpClientParams clientParams;
-	private final String BASIC = "basic";
+	private final String AUTHBASIC = "basic";
 	private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
 	private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
 		+ ">";
 	private final String baseUrl;
 	private final String resumptionType;
 	private final String resumptionParam;
 	private final String resultFormatValue;
-	private String queryParams;
+	private String queryParams = "";
 	private final int resultSizeValue;
 	private int resumptionInt = 0; // integer resumption token (first record to harvest)
 	private int resultTotal = -1;
-	private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
+	private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
-																	// or token scanned from results)
+																			// harvest
 	// or token scanned from results)
 	private InputStream resultStream;
 	private Transformer transformer;
 	private XPath xpath;
@ -73,7 +81,7 @@ public class RestIterator implements Iterator<String> {
 	private final String querySize;
 	private final String authMethod;
 	private final String authToken;
-	private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
+	private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
 	private int discoverResultSize = 0;
 	private int pagination = 1;
 	/*
@ -83,8 +91,13 @@ public class RestIterator implements Iterator<String> {
 	 */
 	private final String resultOutputFormat;
-	/** RestIterator class
+	/*
-	 *  compatible to version 1.3.33
+	 * Can be used to set additional request headers, like for content negotiation
 	 */
 	private Map<String, String> requestHeaders;
 	/**
 	 * RestIterator class compatible to version 1.3.33
 	 */
 	public RestIterator(
 		final HttpClientParams clientParams,
@ -101,47 +114,56 @@ public class RestIterator implements Iterator<String> {
 		final String entityXpath,
 		final String authMethod,
 		final String authToken,
-		final String resultOutputFormat) {
+		final String resultOutputFormat,
 		final Map<String, String> requestHeaders) {
 		this.clientParams = clientParams;
 		this.baseUrl = baseUrl;
 		this.resumptionType = resumptionType;
 		this.resumptionParam = resumptionParam;
 		this.resultFormatValue = resultFormatValue;
-		this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
+		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
 		this.queryParams = queryParams;
 		this.authMethod = authMethod;
 		this.authToken = authToken;
 		this.resultOutputFormat = resultOutputFormat;
 		this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
-		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+		this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
 			: "";
 		this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
 			: "";
 		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
 		try {
 			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
-		} catch (Exception e) {
+		} catch (final Exception e) {
 			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
 		}
 		initQueue();
 	}
-	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
+	private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
 		final String entityXpath)
 		throws TransformerConfigurationException, XPathExpressionException {
 		final TransformerFactory factory = TransformerFactory.newInstance();
-		transformer = factory.newTransformer();
+		this.transformer = factory.newTransformer();
-		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+		this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
-		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
+		this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
-		xpath = XPathFactory.newInstance().newXPath();
+		this.xpath = XPathFactory.newInstance().newXPath();
-		xprResultTotalPath = xpath.compile(resultTotalXpath);
+		this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
-		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
+		this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
-		xprEntity = xpath.compile(entityXpath);
+		this.xprEntity = this.xpath.compile(entityXpath);
 	}
 	private void initQueue() {
-		query = baseUrl + "?" + queryParams + querySize + queryFormat;
+		if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
-		log.info("REST calls starting with {}", query);
+			query = baseUrl;
 		} else {
 			query = baseUrl + "?" + queryParams + querySize + queryFormat;
 		}
 		log.info("REST calls starting with {}", this.query);
 	}
 	private void disconnect() {
@ -154,11 +176,22 @@ public class RestIterator implements Iterator<String> {
 	 */
 	@Override
 	public boolean hasNext() {
-		if (recordQueue.isEmpty() && query.isEmpty()) {
+		synchronized (this.recordQueue) {
 			while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
 				try {
 					this.query = downloadPage(this.query, 0);
 				} catch (final CollectorException e) {
 					log.debug("CollectorPlugin.next()-Exception: {}", e);
 					throw new RuntimeException(e);
 				}
 			}
 			if (!this.recordQueue.isEmpty()) {
 				return true;
 			}
 			disconnect();
 			return false;
 		} else {
 			return true;
 		}
 	}
@ -168,214 +201,248 @@ public class RestIterator implements Iterator<String> {
 	 */
 	@Override
 	public String next() {
-		synchronized (recordQueue) {
+		synchronized (this.recordQueue) {
-			while (recordQueue.isEmpty() && !query.isEmpty()) {
+			return this.recordQueue.poll();
 				try {
 					query = downloadPage(query);
 				} catch (CollectorException e) {
 					log.debug("CollectorPlugin.next()-Exception: {}", e);
 					throw new RuntimeException(e);
 				}
 			}
 			return recordQueue.poll();
 		}
 	}
 	/*
-	 * download page and return nextQuery
+	 * download page and return nextQuery (with number of attempt)
 	 */
-	private String downloadPage(String query) throws CollectorException {
+	private String downloadPage(String query, final int attempt) throws CollectorException {
 		String resultJson;
 		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
 		String nextQuery = "";
 		String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
 		Node resultNode = null;
 		NodeList nodeList = null;
 		String qUrlArgument = "";
 		int urlOldResumptionSize = 0;
 		InputStream theHttpInputStream;
-		// check if cursor=* is initial set otherwise add it to the queryParam URL
+		if (attempt > MAX_ATTEMPTS) {
-		if (resumptionType.equalsIgnoreCase("deep-cursor")) {
+			throw new CollectorException("Max Number of attempts reached, query:" + query);
-			log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
+		}
-			if (!query.contains("&cursor=")) {
+
-				query += "&cursor=*";
+		if (attempt > 0) {
 			final int delay = (attempt * 5000);
 			log.debug("Attempt {} with delay {}", attempt, delay);
 			try {
 				Thread.sleep(delay);
 			} catch (final InterruptedException e) {
 				new CollectorException(e);
 			}
 		}
 		try {
-			log.info("requestig URL [{}]", query);
+			String resultJson;
 			String resultXml = XML_HEADER;
 			String nextQuery = "";
 			Node resultNode = null;
 			NodeList nodeList = null;
 			String qUrlArgument = "";
 			int urlOldResumptionSize = 0;
 			InputStream theHttpInputStream;
-			URL qUrl = new URL(query);
+			// check if cursor=* is initial set otherwise add it to the queryParam URL
-			log.debug("authMethod: {}", authMethod);
+			if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
-			if ("bearer".equalsIgnoreCase(this.authMethod)) {
+				log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
-				log.trace("authMethod before inputStream: {}", resultXml);
+				if (!query.contains("&cursor=")) {
-				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+					query += "&cursor=*";
 				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
 				conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
 				conn.setRequestMethod("GET");
 				theHttpInputStream = conn.getInputStream();
 			} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
 				log.trace("authMethod before inputStream: {}", resultXml);
 				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
 				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
 				conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
 				conn.setRequestMethod("GET");
 				theHttpInputStream = conn.getInputStream();
 			} else {
 				theHttpInputStream = qUrl.openStream();
 			}
 			resultStream = theHttpInputStream;
 			if ("json".equals(resultOutputFormat)) {
 				resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
 				resultXml = JsonUtils.convertToXML(resultJson);
 				resultStream = IOUtils.toInputStream(resultXml, UTF_8);
 			}
 			if (!(emptyXml).equalsIgnoreCase(resultXml)) {
 				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
 				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
 				log.debug("nodeList.length: {}", nodeList.getLength());
 				for (int i = 0; i < nodeList.getLength(); i++) {
 					StringWriter sw = new StringWriter();
 					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
 					String toEnqueue = sw.toString();
 					if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
 						log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
 					} else {
 						recordQueue.add(sw.toString());
 					}
 				}
 			} else {
 				log.warn("resultXml is equal with emptyXml");
 			}
-			resumptionInt += resultSizeValue;
+			// find pagination page start number in queryParam and remove before start the first query
 			if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
 				&& (query.contains("paginationStart="))) {
-			switch (resumptionType.toLowerCase()) {
+				final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
-				case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
+				m.find(); // guaranteed to be true for this regex
 					resumptionStr = xprResumptionPath.evaluate(resultNode);
 					break;
-				case "count": // begin at one step for all records, iterate over items
+				String[] pageVal = m.group(0).split("=");
-					resumptionStr = Integer.toString(resumptionInt);
+				pagination = Integer.parseInt(pageVal[1]);
 					break;
-				case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
+				// remove page start number from query and queryParams
-					if (resultSizeValue < 2) {
+				queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", "");
-						throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
+				query = query.replaceFirst("&?paginationStart=[0-9]+", "");
 			}
 			try {
 				log.info("requesting URL [{}]", query);
 				final URL qUrl = new URL(query);
 				log.debug("authMethod: {}", this.authMethod);
 				if (this.authMethod == "bearer") {
 					log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
 					requestHeaders.put("Authorization", "Bearer " + authToken);
 					// requestHeaders.put("Content-Type", "application/json");
 				} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
 					log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
 					requestHeaders.put("Authorization", "Basic " + authToken);
 					// requestHeaders.put("accept", "application/xml");
 				}
 				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
 				conn.setRequestMethod("GET");
 				this.setRequestHeader(conn);
 				resultStream = conn.getInputStream();
 				if ("json".equals(this.resultOutputFormat)) {
 					resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
 					resultXml = JsonUtils.convertToXML(resultJson);
 					this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
 				}
 				if (!isEmptyXml(resultXml)) {
 					resultNode = (Node) this.xpath
 						.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
 					nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
 					log.debug("nodeList.length: {}", nodeList.getLength());
 					for (int i = 0; i < nodeList.getLength(); i++) {
 						final StringWriter sw = new StringWriter();
 						this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
 						final String toEnqueue = sw.toString();
 						if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
 							log
 								.warn(
 									"The following record resulted in empty item for the feeding queue: {}", resultXml);
 						} else {
 							this.recordQueue.add(sw.toString());
 						}
 					}
-					qUrlArgument = qUrl.getQuery();
+				} else {
-					String[] arrayQUrlArgument = qUrlArgument.split("&");
+					log.warn("resultXml is equal with emptyXml");
-					for (String arrayUrlArgStr : arrayQUrlArgument) {
+				}
-						if (arrayUrlArgStr.startsWith(resumptionParam)) {
+
-							String[] resumptionKeyValue = arrayUrlArgStr.split("=");
+				this.resumptionInt += this.resultSizeValue;
-							if (isInteger(resumptionKeyValue[1])) {
+
-								urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
+				switch (this.resumptionType.toLowerCase()) {
-								log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
+					case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
-							} else {
+						this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
-								log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
+						break;
 					case "count": // begin at one step for all records, iterate over items
 						this.resumptionStr = Integer.toString(this.resumptionInt);
 						break;
 					case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
 						if (this.resultSizeValue < 2) {
 							throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
 						}
 						qUrlArgument = qUrl.getQuery();
 						final String[] arrayQUrlArgument = qUrlArgument.split("&");
 						for (final String arrayUrlArgStr : arrayQUrlArgument) {
 							if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
 								final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
 								if (isInteger(resumptionKeyValue[1])) {
 									urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
 									log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
 								} else {
 									log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
 								}
 							}
 						}
 					}
-					if (((emptyXml).equalsIgnoreCase(resultXml))
+						if (isEmptyXml(resultXml)
-						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
+							|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
-						// resumptionStr = "";
+							// resumptionStr = "";
-						if (nodeList != null) {
+							if (nodeList != null) {
-							discoverResultSize += nodeList.getLength();
+								this.discoverResultSize += nodeList.getLength();
 							}
 							this.resultTotal = this.discoverResultSize;
 						} else {
 							this.resumptionStr = Integer.toString(this.resumptionInt);
 							this.resultTotal = this.resumptionInt + 1;
 							if (nodeList != null) {
 								this.discoverResultSize += nodeList.getLength();
 							}
 						}
-						resultTotal = discoverResultSize;
+						log.info("discoverResultSize: {}", this.discoverResultSize);
-					} else {
+						break;
-						resumptionStr = Integer.toString(resumptionInt);
+
-						resultTotal = resumptionInt + 1;
+					case "pagination":
-						if (nodeList != null) {
+					case "page": // pagination, iterate over page numbers
-							discoverResultSize += nodeList.getLength();
+						if (nodeList != null && nodeList.getLength() > 0) {
 							this.discoverResultSize += nodeList.getLength();
 						} else {
 							this.resultTotal = this.discoverResultSize;
 							this.pagination = this.discoverResultSize;
 						}
-					}
+						this.pagination += 1;
-					log.info("discoverResultSize: {}", discoverResultSize);
+						this.resumptionInt = this.pagination;
-					break;
+						this.resumptionStr = Integer.toString(this.resumptionInt);
 						break;
-				case "pagination":
+					case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
-				case "page": // pagination, iterate over page numbers
+										// in
-					pagination += 1;
+										// solr)
-					if (nodeList != null) {
+						// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
-						discoverResultSize += nodeList.getLength();
+						// deep-cursor, Param 'resultSizeValue' is less than 2");}
 					} else {
 						resultTotal = discoverResultSize;
 						pagination = discoverResultSize;
 					}
 					resumptionInt = pagination;
 					resumptionStr = Integer.toString(resumptionInt);
 					break;
-				case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
+						this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
-									// solr)
+						this.queryParams = this.queryParams.replace("&cursor=*", "");
 					// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
 					// deep-cursor, Param 'resultSizeValue' is less than 2");}
-					resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
+						// terminating if length of nodeList is 0
-					queryParams = queryParams.replace("&cursor=*", "");
+						if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
 							this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
 						} else {
 							this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
 																									// resultSizeValue
 							// because the iteration is over
 							// real length and the
 							// resultSizeValue is added before
 							// the switch()
 						}
-					// terminating if length of nodeList is 0
+						this.discoverResultSize = nodeList.getLength();
 					if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
 						resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
 					} else {
 						resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
 																					// because the iteration is over
 																					// real length and the
 																					// resultSizeValue is added before
 																					// the switch()
 					}
-					discoverResultSize = nodeList.getLength();
+						log
 							.debug(
 								"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
 									+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
-					log
+						break;
 						.debug(
 							"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
 								+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
-					break;
+					default: // otherwise: abort
 						// resultTotal = resumptionInt;
 						break;
 				}
-				default: // otherwise: abort
+			} catch (final Exception e) {
-					// resultTotal = resumptionInt;
+				log.error(e.getMessage(), e);
-					break;
+				throw new IllegalStateException("collection failed: " + e.getMessage());
 			}
-		} catch (Exception e) {
+			try {
-			log.error(e.getMessage(), e);
+				if (this.resultTotal == -1) {
-			throw new IllegalStateException("collection failed: " + e.getMessage());
+					this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
-		}
+					if ("page".equalsIgnoreCase(this.resumptionType)
-
+						&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
-		try {
+						this.resultTotal += 1;
-			if (resultTotal == -1) {
+					} // to correct the upper bound
-				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
+					log.info("resultTotal was -1 is now: " + this.resultTotal);
-				if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
+				}
-					resultTotal += 1;
+			} catch (final Exception e) {
-				} // to correct the upper bound
+				log.error(e.getMessage(), e);
-				log.info("resultTotal was -1 is now: " + resultTotal);
+				throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
 			}
-		} catch (Exception e) {
+			log.debug("resultTotal: " + this.resultTotal);
-			log.error(e.getMessage(), e);
+			log.debug("resInt: " + this.resumptionInt);
-			throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
+			if (this.resumptionInt <= this.resultTotal) {
 				nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
 					+ this.resumptionStr
 					+ this.queryFormat;
 			} else {
 				nextQuery = "";
 				// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
 				// resumptionInt and prevent a NullPointer Exception at mdStore
 			}
 			log.debug("nextQueryUrl: " + nextQuery);
 			return nextQuery;
 		} catch (final Throwable e) {
 			log.warn(e.getMessage(), e);
 			return downloadPage(query, attempt + 1);
 		}
 		log.debug("resultTotal: " + resultTotal);
 		log.debug("resInt: " + resumptionInt);
 		if (resumptionInt <= resultTotal) {
 			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
 				+ queryFormat;
 		} else {
 			nextQuery = "";
 			// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
 			// resumptionInt and prevent a NullPointer Exception at mdStore
 		}
 		log.debug("nextQueryUrl: " + nextQuery);
 		return nextQuery;
 	}
-	private boolean isInteger(String s) {
+	private boolean isEmptyXml(String s) {
 		return EMPTY_XML.equalsIgnoreCase(s);
 	}
 	private boolean isInteger(final String s) {
 		boolean isValidInteger = false;
 		try {
 			Integer.parseInt(s);
@ -383,7 +450,7 @@ public class RestIterator implements Iterator<String> {
 			// s is a valid integer
 			isValidInteger = true;
-		} catch (NumberFormatException ex) {
+		} catch (final NumberFormatException ex) {
 			// s is not an integer
 		}
@ -391,20 +458,36 @@ public class RestIterator implements Iterator<String> {
 	}
 	// Method to encode a string value using `UTF-8` encoding scheme
-	private String encodeValue(String value) {
+	private String encodeValue(final String value) {
 		try {
 			return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
-		} catch (UnsupportedEncodingException ex) {
+		} catch (final UnsupportedEncodingException ex) {
 			throw new RuntimeException(ex.getCause());
 		}
 	}
 	/**
 	 * Applies the configured request headers to the given connection.
 	 *
 	 * Each entry of {@code requestHeaders} is set through
 	 * {@link HttpURLConnection#setRequestProperty(String, String)}, so a header that is already
 	 * present on the connection is overwritten with the configured value. Nothing is done when no
 	 * headers are configured.
 	 *
 	 * @param conn the connection, not yet connected, that receives the headers
 	 */
 	private void setRequestHeader(final HttpURLConnection conn) {
 		if (this.requestHeaders == null) {
 			return;
 		}
 		for (final Map.Entry<String, String> header : this.requestHeaders.entrySet()) {
 			conn.setRequestProperty(header.getKey(), header.getValue());
 		}
 		// Log the header names only: the values may contain the Authorization token
 		// (Bearer/Basic credentials), which must not end up in the logs.
 		log.debug("Set Request Header with: {}", this.requestHeaders.keySet());
 	}
 	public String getResultFormatValue() {
-		return resultFormatValue;
+		return this.resultFormatValue;
 	}
 	public String getResultOutputFormat() {
-		return resultOutputFormat;
+		return this.resultOutputFormat;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
@ -8,7 +8,10 @@ import java.io.StringWriter;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Collectors;
 import javax.xml.stream.XMLEventFactory;
 import javax.xml.stream.XMLEventReader;
@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.events.StartElement;
 import javax.xml.stream.events.XMLEvent;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {
 	private String element;
 	private List<String> elements;
 	private InputStream inputStream;
 	public XMLIterator(final String element, final InputStream inputStream) {
 		super();
 		this.element = element;
 		if (element.contains(",")) {
 			elements = Arrays
 				.stream(element.split(","))
 				.filter(StringUtils::isNoneBlank)
 				.map(String::toLowerCase)
 				.collect(Collectors.toList());
 		}
 		this.inputStream = inputStream;
 		this.parser = getParser();
 		try {
 			this.current = findElement(parser);
 		} catch (XMLStreamException e) {
@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
 				final XMLEvent event = parser.nextEvent();
 				// TODO: replace with depth tracking instead of close tag tracking.
-				if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
+				if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
 					writer.add(event);
 					break;
 				}
@ -142,18 +156,16 @@ public class XMLIterator implements Iterator<String> {
 		XMLEvent peek = parser.peek();
 		if (peek != null && peek.isStartElement()) {
 			String name = peek.asStartElement().getName().getLocalPart();
-			if (element.equals(name)) {
+			if (isCheckTag(name))
 				return peek;
 			}
 		}
 		while (parser.hasNext()) {
-			final XMLEvent event = parser.nextEvent();
+			XMLEvent event = parser.nextEvent();
 			if (event != null && event.isStartElement()) {
 				String name = event.asStartElement().getName().getLocalPart();
-				if (element.equals(name)) {
+				if (isCheckTag(name))
 					return event;
 				}
 			}
 		}
 		return null;
@ -161,12 +173,31 @@ public class XMLIterator implements Iterator<String> {
 	private XMLEventReader getParser() {
 		try {
-			return inputFactory.get().createXMLEventReader(sanitize(inputStream));
+			XMLInputFactory xif = inputFactory.get();
 			xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
 			return xif.createXMLEventReader(sanitize(inputStream));
 		} catch (XMLStreamException e) {
 			throw new RuntimeException(e);
 		}
 	}
 	/**
 	 * Tells whether a tag name is one of the tags this iterator is looking for.
 	 *
 	 * When the {@code elements} list is set, the tag matches if it equals any entry of the list,
 	 * ignoring case; otherwise it is compared, ignoring case, with the single {@code element}.
 	 *
 	 * @param tagName the local name of the tag to check
 	 * @return {@code true} if the tag is one of the expected ones, {@code false} otherwise
 	 */
 	private boolean isCheckTag(final String tagName) {
 		if (elements == null) {
 			return element.equalsIgnoreCase(tagName);
 		}
 		return elements.stream().anyMatch(candidate -> candidate.equalsIgnoreCase(tagName));
 	}
 	private Reader sanitize(final InputStream in) {
 		final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
 		charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
@ -16,5 +16,10 @@
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "the hdfs name node",
    "paramRequired": false
-  }
+  },{
  "paramName": "bl",
  "paramLongName": "blackListPath",
   "paramDescription": "the path of the blacklist",
  "paramRequired": true
 }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
@ -1,2 +1,3 @@
 sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
 outputPath=/tmp/miriam/webcrawlComplete/
 blackListPath=/user/miriam.baglioni/openalex-blackList
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
@ -45,6 +45,7 @@
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--blackListPath</arg><arg>${blackListPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
@ -1,10 +1,5 @@
 [
-  {
+
    "id": "100007630",
    "uri": "http://dx.doi.org/10.13039/100007630",
    "name": "College of Engineering and Informatics, National University of Ireland, Galway",
    "synonym": []
  },
  {
    "id": "100007731",
    "uri": "http://dx.doi.org/10.13039/100007731",
@ -58,7 +53,7 @@
    "uri": "http://dx.doi.org/10.13039/100010414",
    "name": "Health Research Board",
    "synonym": [
-      "501100001590"
+      "501100001590", "501100023273"
    ]
  },
  {
@ -85,24 +80,6 @@
    "name": "Irish College of General Practitioners",
    "synonym": []
  },
  {
    "id": "100012734",
    "uri": "http://dx.doi.org/10.13039/100012734",
    "name": "Department for Culture, Heritage and the Gaeltacht, Ireland",
    "synonym": []
  },
  {
    "id": "100012754",
    "uri": "http://dx.doi.org/10.13039/100012754",
    "name": "Horizon Pharma",
    "synonym": []
  },
  {
    "id": "100012891",
    "uri": "http://dx.doi.org/10.13039/100012891",
    "name": "Medical Research Charities Group",
    "synonym": []
  },
  {
    "id": "100012919",
    "uri": "http://dx.doi.org/10.13039/100012919",
@ -233,7 +210,7 @@
    "id": "100018064",
    "uri": "http://dx.doi.org/10.13039/100018064",
    "name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media",
-    "synonym": []
+    "synonym": ["100012734"]
  },
  {
    "id": "100018172",
@ -281,13 +258,13 @@
    "id": "100019637",
    "uri": "http://dx.doi.org/10.13039/100019637",
    "name": "Horizon Therapeutics",
-    "synonym": []
+    "synonym": ["100012754"]
  },
  {
    "id": "100020174",
    "uri": "http://dx.doi.org/10.13039/100020174",
    "name": "Health Research Charities Ireland",
-    "synonym": []
+    "synonym": ["100012891"]
  },
  {
    "id": "100020202",
@ -319,12 +296,7 @@
    "name": "Centre for Ageing Research and Development in Ireland",
    "synonym": []
  },
-  {
+
    "id": "501100001583",
    "uri": "http://dx.doi.org/10.13039/501100001583",
    "name": "Cystinosis Foundation Ireland",
    "synonym": []
  },
  {
    "id": "501100001584",
    "uri": "http://dx.doi.org/10.13039/501100001584",
@ -455,13 +427,13 @@
    "id": "501100001634",
    "uri": "http://dx.doi.org/10.13039/501100001634",
    "name": "University of Galway",
-    "synonym": []
+    "synonym": ["501100019905", "100007630", "501100020570", "501100023852"]
  },
  {
    "id": "501100001635",
    "uri": "http://dx.doi.org/10.13039/501100001635",
    "name": "University of Limerick",
-    "synonym": []
+    "synonym": ["501100014531"]
  },
  {
    "id": "501100001636",
@ -491,7 +463,7 @@
    "id": "501100002736",
    "uri": "http://dx.doi.org/10.13039/501100002736",
    "name": "Covidien",
-    "synonym": []
+    "synonym": ["501100003956"]
  },
  {
    "id": "501100002755",
@ -521,7 +493,7 @@
    "id": "501100003037",
    "uri": "http://dx.doi.org/10.13039/501100003037",
    "name": "Elan",
-    "synonym": []
+    "synonym": ["501100021694"]
  },
  {
    "id": "501100003496",
@ -541,12 +513,6 @@
    "name": "Irish Institute of Clinical Neuroscience",
    "synonym": []
  },
  {
    "id": "501100003956",
    "uri": "http://dx.doi.org/10.13039/501100003956",
    "name": "Aspect Medical Systems",
    "synonym": []
  },
  {
    "id": "501100004162",
    "uri": "http://dx.doi.org/10.13039/501100004162",
@ -595,17 +561,11 @@
    "name": "Technological University Dublin",
    "synonym": []
  },
  {
    "id": "501100009269",
    "uri": "http://dx.doi.org/10.13039/501100009269",
    "name": "Programme of Competitive Forestry Research for Development",
    "synonym": []
  },
  {
    "id": "501100009315",
    "uri": "http://dx.doi.org/10.13039/501100009315",
    "name": "Cystinosis Ireland",
-    "synonym": []
+    "synonym": ["501100001583"]
  },
  {
    "id": "501100010808",
@ -625,12 +585,6 @@
    "name": "Alimentary Health",
    "synonym": []
  },
  {
    "id": "501100011103",
    "uri": "http://dx.doi.org/10.13039/501100011103",
    "name": "Rann\u00eds",
    "synonym": []
  },
  {
    "id": "501100012354",
    "uri": "http://dx.doi.org/10.13039/501100012354",
@ -679,12 +633,7 @@
    "name": "Irish Centre for High-End Computing",
    "synonym": []
  },
-  {
+
    "id": "501100019905",
    "uri": "http://dx.doi.org/10.13039/501100019905",
    "name": "Galway University Foundation",
    "synonym": []
  },
  {
    "id": "501100020036",
    "uri": "http://dx.doi.org/10.13039/501100020036",
@ -733,12 +682,6 @@
    "name": "Insight SFI Research Centre for Data Analytics",
    "synonym": []
  },
  {
    "id": "501100021694",
    "uri": "http://dx.doi.org/10.13039/501100021694",
    "name": "Elan Pharma International",
    "synonym": []
  },
  {
    "id": "501100021838",
    "uri": "http://dx.doi.org/10.13039/501100021838",
@ -769,12 +712,6 @@
    "name": "Institute of Technology, Tralee",
    "synonym": []
  },
  {
    "id": "501100023273",
    "uri": "http://dx.doi.org/10.13039/501100023273",
    "name": "HRB Clinical Research Facility Galway",
    "synonym": []
  },
  {
    "id": "501100023378",
    "uri": "http://dx.doi.org/10.13039/501100023378",
@ -871,12 +808,7 @@
    "name": "Energy Policy Research Centre, Economic and Social Research Institute",
    "synonym": []
  },
-  {
+
    "id": "501100014531",
    "uri": "http://dx.doi.org/10.13039/501100014531",
    "name": "Physical Education and Sport Sciences Department, University of Limerick",
    "synonym": []
  },
  {
    "id": "501100014745",
    "uri": "http://dx.doi.org/10.13039/501100014745",
@ -889,22 +821,11 @@
    "name": "ADAPT - Centre for Digital Content Technology",
    "synonym": []
  },
-  {
+
    "id": "501100020570",
    "uri": "http://dx.doi.org/10.13039/501100020570",
    "name": "College of Medicine, Nursing and Health Sciences, National University of Ireland, Galway",
    "synonym": []
  },
  {
    "id": "501100020871",
    "uri": "http://dx.doi.org/10.13039/501100020871",
    "name": "Bernal Institute, University of Limerick",
    "synonym": []
  },
  {
    "id": "501100023852",
    "uri": "http://dx.doi.org/10.13039/501100023852",
    "name": "Moore Institute for Research in the Humanities and Social Studies, University of Galway",
    "synonym": []
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml
@ -48,12 +48,37 @@
            <description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
        </property>
        <property>
            <name>JAVA_HOME</name>
            <value>/srv/java/openjdk-17</value>
            <description>Used to configure the Java home location for oozie.launcher.mapreduce.map.env</description>
        </property>
        <property>
            <name>JAVA_OPTS</name>
                <value>-Dcom.sun.security.enableAIAcaIssuers=true</value>
            <description>Used to configure the JAVA_OPTS parameter</description>
        </property>
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapreduce.map.env</name>
                <value>JAVA_HOME=${JAVA_HOME}</value>
            </property>
        </configuration>
    </global>
    <start to="collection_mode"/>
@ -99,7 +124,7 @@
    <action name="CollectionWorker">
        <java>
            <main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
-            <java-opts>${collection_java_xmx}</java-opts>
+            <java-opts>${JAVA_OPTS} ${collection_java_xmx}</java-opts>
            <arg>--apidescriptor</arg><arg>${apiDescription}</arg>
            <arg>--namenode</arg><arg>${nameNode}</arg>
            <arg>--workflowId</arg><arg>${workflowId}</arg>
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
            tp._1 match {
              case "electronic" => journal.setIssnOnline(tp._2)
              case "print"      => journal.setIssnPrinted(tp._2)
              case _            =>
            }
          })
        }
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
@ -79,23 +79,6 @@ object MagUtility extends Serializable {
  private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
  private val MAGDataInfo: DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    di.setInvisible(false)
    di.setTrust("0.9")
    di.setProvenanceaction(
      OafMapperUtils.qualifier(
        ModelConstants.SYSIMPORT_ACTIONSET,
        ModelConstants.SYSIMPORT_ACTIONSET,
        ModelConstants.DNET_PROVENANCE_ACTIONS,
        ModelConstants.DNET_PROVENANCE_ACTIONS
      )
    )
    di
  }
  private val MAGDataInfoInvisible: DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
@ -453,7 +436,6 @@ object MagUtility extends Serializable {
      case "repository" =>
        result = new Publication()
        result.setDataInfo(MAGDataInfoInvisible)
        qualifier(
          "0038",
          "Other literature type",
@ -488,8 +470,7 @@ object MagUtility extends Serializable {
    }
    if (result != null) {
-      if (result.getDataInfo == null)
+      result.setDataInfo(MAGDataInfo)
        result.setDataInfo(MAGDataInfo)
      val i = new Instance
      i.setInstancetype(tp)
      i.setInstanceTypeMapping(
@ -512,7 +493,7 @@ object MagUtility extends Serializable {
      return null
    result.setCollectedfrom(List(MAGCollectedFrom).asJava)
-    val pidList = List(
+    var pidList = List(
      structuredProperty(
        paper.paperId.get.toString,
        qualifier(
@ -525,8 +506,6 @@ object MagUtility extends Serializable {
      )
    )
    result.setPid(pidList.asJava)
    result.setOriginalId(pidList.map(s => s.getValue).asJava)
    result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@ -618,22 +597,23 @@ object MagUtility extends Serializable {
    }
    val instance = result.getInstance().get(0)
-    instance.setPid(pidList.asJava)
+
-    if (paper.doi.orNull != null)
+    if (paper.doi.orNull != null) {
-      instance.setAlternateIdentifier(
+      pidList = pidList ::: List(
-        List(
+        structuredProperty(
-          structuredProperty(
+          paper.doi.get,
-            paper.doi.get,
+          qualifier(
-            qualifier(
+            PidType.doi.toString,
-              PidType.doi.toString,
+            PidType.doi.toString,
-              PidType.doi.toString,
+            ModelConstants.DNET_PID_TYPES,
-              ModelConstants.DNET_PID_TYPES,
+            ModelConstants.DNET_PID_TYPES
-              ModelConstants.DNET_PID_TYPES
+          ),
-            ),
+          null
-            null
+        )
          )
        ).asJava
      )
    }
    instance.setPid(pidList.asJava)
    result.setPid(pidList.asJava)
    instance.setUrl(paper.urls.get.asJava)
    instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
    instance.setCollectedfrom(MAGCollectedFrom)
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
    spark.read
      .load(s"$magBasePath/mag_denormalized")
      .as[MAGPaper]
      .filter(col("doi").isNotNull)
      .map(s => MagUtility.convertMAGtoOAF(s))
      .filter(s => s != null)
      .write
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
 import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
+import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
 import eu.dnetlib.dhp.sx.bio.pubmed._
 import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
 import org.apache.http.impl.client.HttpClientBuilder
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql._
 import org.apache.spark.sql.expressions.Aggregator
 import org.slf4j.{Logger, LoggerFactory}
-import java.io.InputStream
+import java.io.{ByteArrayInputStream, InputStream}
-import scala.io.Source
+import java.nio.charset.Charset
-import scala.xml.pull.XMLEventReader
+import javax.xml.stream.XMLInputFactory
 object SparkCreateBaselineDataFrame {
@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
          if (response.getStatusLine.getStatusCode > 400) {
            tries -= 1
          } else
-            return IOUtils.toString(response.getEntity.getContent)
+            return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
        } catch {
          case e: Throwable =>
            println(s"Error on requesting ${r.getURI}")
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
      IOUtils.toString(
        SparkEBILinksToOaf.getClass.getResourceAsStream(
          "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
-        )
+        ),
        Charset.defaultCharset()
      )
    )
    parser.parseArgument(args)
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
    val workingPath = parser.get("workingPath")
    log.info("workingPath: {}", workingPath)
-    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+    val targetPath = parser.get("targetPath")
-    log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+    log.info("targetPath: {}", targetPath)
    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
    log.info("outputBasePath: {}", outputBasePath)
    val hdfsServerUri = parser.get("hdfsServerUri")
-    log.info("hdfsServerUri: {}", hdfsServerUri)
+    log.info("hdfsServerUri: {}", targetPath)
    val skipUpdate = parser.get("skipUpdate")
    log.info("skipUpdate: {}", skipUpdate)
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
    if (!"true".equalsIgnoreCase(skipUpdate)) {
      downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
      val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
      val inputFactory = XMLInputFactory.newInstance
      val ds: Dataset[PMArticle] = spark.createDataset(
        k.filter(i => i._1.endsWith(".gz"))
          .flatMap(i => {
-            val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+            val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
            new PMParser(xml)
          })
      )
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
        .map(a => PubMedToOaf.convert(a, vocabularies))
        .as[Oaf]
        .filter(p => p != null),
-      s"$outputBasePath/$MDSTORE_DATA_PATH"
+      targetPath
    )
    val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
    val mdStoreSize = df.count
    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
@ -1,7 +1,8 @@
 package eu.dnetlib.dhp.sx.bio.pubmed
 import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
 import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
 /** @param xml
  */
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
@ -119,7 +119,9 @@ public class ReadCOCITest {
 					workingDir.toString() + "/COCI",
 					"-outputPath",
 					workingDir.toString() + "/COCI_json/",
-					"-inputFile", "input1;input2;input3;input4;input5"
+					"-inputFile", "input1;input2;input3;input4;input5",
 					"-format",
 					"COCI"
 				});
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
@ -75,7 +75,11 @@ public class CreateASTest {
 		String inputPath = getClass()
 			.getResource(
-				"/eu/dnetlib/dhp/actionmanager/webcrawl/")
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
 			.getPath();
 		String blackListPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
 			.getPath();
 		CreateActionSetFromWebEntries
@ -86,7 +90,8 @@ public class CreateASTest {
 					"-sourcePath",
 					inputPath,
 					"-outputPath",
-					workingDir.toString() + "/actionSet1"
+					workingDir.toString() + "/actionSet1",
 					"-blackListPath", blackListPath
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -96,7 +101,7 @@ public class CreateASTest {
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
-		Assertions.assertEquals(64, tmp.count());
+		Assertions.assertEquals(58, tmp.count());
 	}
@ -109,6 +114,10 @@ public class CreateASTest {
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/webcrawl/")
 			.getPath();
 		String blackListPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
 			.getPath();
 		CreateActionSetFromWebEntries
 			.main(
@ -118,7 +127,8 @@ public class CreateASTest {
 					"-sourcePath",
 					inputPath,
 					"-outputPath",
-					workingDir.toString() + "/actionSet1"
+					workingDir.toString() + "/actionSet1",
 					"-blackListPath", blackListPath
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -184,7 +194,7 @@ public class CreateASTest {
 		Assertions
 			.assertEquals(
-				5, tmp
+				2, tmp
 					.filter(
 						r -> r
 							.getSource()
@ -197,7 +207,7 @@ public class CreateASTest {
 		Assertions
 			.assertEquals(
-				5, tmp
+				2, tmp
 					.filter(
 						r -> r
 							.getTarget()
@ -210,7 +220,7 @@ public class CreateASTest {
 		Assertions
 			.assertEquals(
-				2, tmp
+				1, tmp
 					.filter(
 						r -> r
 							.getTarget()
@ -224,7 +234,7 @@ public class CreateASTest {
 		Assertions
 			.assertEquals(
-				2, tmp
+				1, tmp
 					.filter(
 						r -> r
 							.getTarget()
@ -238,7 +248,7 @@ public class CreateASTest {
 		Assertions
 			.assertEquals(
-				1, tmp
+				0, tmp
 					.filter(
 						r -> r
 							.getTarget()
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java
@ -0,0 +1,64 @@
 package eu.dnetlib.dhp.collection.plugin.file;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Objects;
 import java.util.stream.Stream;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.junit.jupiter.MockitoExtension;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
 import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Exercises {@link FileGZipCollectorPlugin} against the bundled {@code dblp.gz} test resource, configured with
 * more than one comma-separated value in the {@code splitOnElement} parameter.
 */
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)
 public class FileGZipMultipleNodeTest {
 	// Bound to this class (not to FileGZipCollectorPluginTest) so that log lines are attributed to the right test.
 	private static final Logger log = LoggerFactory.getLogger(FileGZipMultipleNodeTest.class);
 	private final ApiDescriptor api = new ApiDescriptor();
 	private FileGZipCollectorPlugin plugin;
 	// Comma-separated value passed as the "splitOnElement" API parameter.
 	private static final String SPLIT_ON_ELEMENT = "incollection,article";
 	/**
 	 * Points the API descriptor at the gzip test resource and creates the plugin on the default Hadoop file system.
 	 *
 	 * @throws IOException if the Hadoop {@link FileSystem} cannot be obtained
 	 */
 	@BeforeEach
 	public void setUp() throws IOException {
 		// Fail fast with a NullPointerException if the resource is missing from the test classpath.
 		final String gzipFile = Objects
 			.requireNonNull(
 				this
 					.getClass()
 					.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
 			.getFile();
 		api.setBaseUrl(gzipFile);
 		HashMap<String, String> params = new HashMap<>();
 		params.put("splitOnElement", SPLIT_ON_ELEMENT);
 		api.setParams(params);
 		FileSystem fs = FileSystem.get(new Configuration());
 		plugin = new FileGZipCollectorPlugin(fs);
 	}
 	/**
 	 * Collects at most ten records and checks that each one is a non-empty string.
 	 *
 	 * @throws CollectorException if the plugin fails to collect from the configured resource
 	 */
 	@Test
 	void test() throws CollectorException {
 		final Stream<String> stream = plugin.collect(api, new AggregatorReport());
 		stream.limit(10).forEach(s -> {
 			Assertions.assertTrue(s.length() > 0);
 			log.info(s);
 		});
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.rest;
 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Stream;
 import org.junit.jupiter.api.Assertions;
@ -35,11 +36,11 @@ public class OsfPreprintCollectorTest {
 	private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";
 	private final String resumptionParam = "page";
-	private final String resumptionType = "page";
+	private final String resumptionType = "scan";
-	private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
+	private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";
-	private final String resultSizeParam = "";
+	private final String resultSizeParam = "page[size]";
-	private final String resultSizeValue = "";
+	private final String resultSizeValue = "100";
 	private final String resultFormatParam = "format";
 	private final String resultFormatValue = "json";
@ -69,11 +70,11 @@ public class OsfPreprintCollectorTest {
 	@Test
 	@Disabled
-	void test() throws CollectorException {
+	void test_limited() throws CollectorException {
 		final AtomicInteger i = new AtomicInteger(0);
 		final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
-		stream.limit(200).forEach(s -> {
+		stream.limit(2000).forEach(s -> {
 			Assertions.assertTrue(s.length() > 0);
 			i.incrementAndGet();
 			log.info(s);
@ -82,4 +83,23 @@ public class OsfPreprintCollectorTest {
 		log.info("{}", i.intValue());
 		Assertions.assertTrue(i.intValue() > 0);
 	}
 	/**
 	 * Disabled by default. When enabled, consumes the whole stream returned by the collector, asserting that every
 	 * item is non-empty, logging progress every 1000 items and finally requiring at least one collected item.
 	 *
 	 * @throws CollectorException if the collector plugin fails
 	 */
 	@Test
 	@Disabled
 	void test_all() throws CollectorException {
 		final AtomicLong collected = new AtomicLong(0);
 		final AggregatorReport report = new AggregatorReport();
 		this.rcp.collect(this.api, report).forEach(item -> {
 			Assertions.assertTrue(item.length() > 0);
 			final boolean atProgressStep = (collected.incrementAndGet() % 1000) == 0;
 			if (atProgressStep) {
 				log.info("COLLECTED: {}", collected.get());
 			}
 		});
 		final long total = collected.get();
 		log.info("TOTAL: {}", total);
 		Assertions.assertTrue(total > 0);
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
@ -4,6 +4,11 @@
 package eu.dnetlib.dhp.collection.plugin.rest;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Stream;
@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.google.gson.Gson;
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
 import eu.dnetlib.dhp.common.collection.CollectorException;
@ -25,18 +32,18 @@ class RestCollectorPluginTest {
 	private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
-	private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
+	private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
-	private final String resumptionType = "count";
+	private final String resumptionType = "discover";
-	private final String resumptionParam = "from";
+	private final String resumptionParam = "skip";
-	private final String entityXpath = "//hits/hits";
+	private final String entityXpath = "//*[local-name()='data']";
-	private final String resumptionXpath = "//hits";
+	private final String resumptionXpath = "";
-	private final String resultTotalXpath = "//hits/total";
+	private final String resultTotalXpath = "//*[local-name()='count']";
-	private final String resultFormatParam = "format";
+	private final String resultFormatParam = "";
 	private final String resultFormatValue = "json";
-	private final String resultSizeParam = "size";
+	private final String resultSizeParam = "top";
 	private final String resultSizeValue = "10";
 	// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
-	private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
+	private final String query = "";
 	// private String query = "=(sources:engrXiv AND type:preprint)";
 	private final String protocolDescriptor = "rest_json2xml";
@ -56,6 +63,7 @@ class RestCollectorPluginTest {
 		params.put("resultSizeValue", resultSizeValue);
 		params.put("queryParams", query);
 		params.put("entityXpath", entityXpath);
 		params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
 		api.setBaseUrl(baseUrl);
 		api.setParams(params);
@ -78,4 +86,19 @@ class RestCollectorPluginTest {
 		log.info("{}", i.intValue());
 		Assertions.assertTrue(i.intValue() > 0);
 	}
 	/**
 	 * Disabled manual connectivity check: issues a GET against the World Bank search endpoint with a custom
 	 * {@code User-Agent}, prints the headers returned by the server and verifies that the response body can be
 	 * opened.
 	 *
 	 * @throws IOException if the connection cannot be opened or the server answers with an error status
 	 */
 	@Disabled
 	@Test
 	void testUrl() throws IOException {
 		final String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
 		final URL url = new URL(url_s);
 		final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
 		try {
 			conn.setRequestMethod("GET");
 			conn.setRequestProperty("User-Agent", "OpenAIRE");
 			final Gson gson = new Gson();
 			// getHeaderFields() exposes the headers of the server response, not those of the request.
 			System.out.println("Response header");
 			System.out.println(gson.toJson(conn.getHeaderFields()));
 			// Open the body only to prove it is readable, then release it immediately.
 			try (InputStream inputStream = conn.getInputStream()) {
 				Assertions.assertNotNull(inputStream);
 			}
 		} finally {
 			// Release the underlying connection even when the request fails.
 			conn.disconnect();
 		}
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
@ -44,7 +44,7 @@ public class RestIteratorTest {
 		final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
 			resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
-			query, entityXpath, authMethod, authToken, resultOffsetParam);
+			query, entityXpath, authMethod, authToken, resultOffsetParam, null);
 		int i = 20;
 		while (iterator.hasNext() && i > 0) {
 			String result = iterator.next();
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
@ -789,10 +789,6 @@
      "value": "2227-9717",
      "type": "electronic"
    },
    {
      "value": "VALUE",
      "type": "PIPPO"
    },
    {
      "value": "1063-4584",
      "type": "pu"
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
 import org.apache.commons.io.IOUtils
 import org.junit.jupiter.api.{BeforeEach, Test}
 import org.junit.jupiter.api.extension.ExtendWith
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.{Logger, LoggerFactory}
@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
    super.setUpVocabulary()
  }
  /** Loads the `issn_pub.json` Crossref test record, converts it with every transformation type enabled
    * and prints the outcome; no assertion is made on the converted value.
    */
  @Test
  def mappingRecord(): Unit = {
    val resourceStream = getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json")
    val json = IOUtils.toString(resourceStream, "utf-8")
    val converted = Crossref2Oaf.convert(json, vocabularies, TransformationType.All)
    println(converted)
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.functions.col
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test
@ -18,10 +19,8 @@ class MAGMappingTest {
      .master("local[*]")
      .getOrCreate()
-    val s = new SparkMagOrganizationAS(null, null, null)
+    val s = new SparkMAGtoOAF(null, null, null)
-
+    s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
    s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
  }
  @Test
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
 import java.io.{BufferedReader, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
 import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
  @Test
  def testEBIData() = {
-    val inputXML = Source
+    val inputFactory = XMLInputFactory.newInstance
-      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
      .mkString
    val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
    new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
  }
@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
  @Test
  def testParsingPubmedXML(): Unit = {
-    val xml = new XMLEventReader(
+    val inputFactory = XMLInputFactory.newInstance
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
-    )
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    val parser = new PMParser(xml)
    parser.foreach(checkPMArticle)
  }
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
  @Test
  def testPubmedMapping(): Unit = {
-    val xml = new XMLEventReader(
+    val inputFactory = XMLInputFactory.newInstance
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-    )
+
    val parser = new PMParser(xml)
    val results = ListBuffer[Oaf]()
    parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
@ -26,15 +26,15 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.Software;
-public class PrepareSimpleEntititiesJob {
+public class PrepareSimpleEntitiesJob {
-	private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class);
+	private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntitiesJob.class);
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					PrepareSimpleEntititiesJob.class
+					PrepareSimpleEntitiesJob.class
 						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
 		parser.parseArgument(args);
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
@ -160,8 +160,7 @@ public class ConversionUtils {
 			.stream()
 			.filter(Objects::nonNull)
 			.filter(pid -> pid.getQualifier() != null)
-			.filter(pid -> pid.getQualifier().getClassid() != null)
+			.filter(pid -> StringUtils.startsWithIgnoreCase(pid.getQualifier().getClassid(), ModelConstants.ORCID))
 			.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
 			.map(StructuredProperty::getValue)
 			.map(ConversionUtils::cleanOrcid)
 			.filter(StringUtils::isNotBlank)
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
@ -7,7 +7,7 @@
        </property>
        <property>
            <name>outputDir</name>
-            <description>the path where the the generated data will be stored</description>
+            <description>the path where the generated data will be stored</description>
        </property>
 		<property>
            <name>datasourceIdWhitelist</name>
@ -179,17 +179,18 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>PrepareSimpleEntititiesJob</name>
-            <class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob</class>
+            <class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntitiesJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=5000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -209,11 +210,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -234,11 +236,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -258,11 +261,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=5000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -282,11 +286,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=10000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -306,11 +311,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=2000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -332,11 +338,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -356,11 +363,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -380,11 +388,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -404,11 +413,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -428,11 +438,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -452,11 +463,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -476,11 +488,12 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
            <arg>--outputDir</arg><arg>${outputDir}</arg>
@ -503,6 +516,7 @@
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} 
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -535,6 +549,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -562,6 +577,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -585,6 +601,7 @@
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} 
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcidTest.java
+++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcidTest.java
@ -0,0 +1,66 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.util.List;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.broker.objects.OaBrokerAuthor;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 /**
  * Exercises {@code EnrichMissingAuthorOrcid.findDifferences(source, target)} with four
  * combinations of creators on the source and target entities.
  */
 class EnrichMissingAuthorOrcidTest {
 	final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();
 	@BeforeEach
 	void setUp() throws Exception {
 	}
 	/** Neither entity has creators: no difference is expected. */
 	@Test
 	void testFindDifferences_1() {
 		final List<OaBrokerAuthor> differences = this.matcher
 			.findDifferences(new OaBrokerMainEntity(), new OaBrokerMainEntity());
 		assertTrue(differences.isEmpty());
 	}
 	/** The source author carries an ORCID, the target one does not: exactly one difference is expected. */
 	@Test
 	void testFindDifferences_2() {
 		final List<OaBrokerAuthor> differences = differencesForSingleAuthor("0000-0001-9613-6639", null);
 		assertEquals(1, differences.size());
 	}
 	/** Only the target author carries an ORCID: no difference is expected. */
 	@Test
 	void testFindDifferences_3() {
 		final List<OaBrokerAuthor> differences = differencesForSingleAuthor(null, "0000-0001-9613-6639");
 		assertTrue(differences.isEmpty());
 	}
 	/** Both authors carry the same ORCID: no difference is expected. */
 	@Test
 	void testFindDifferences_4() {
 		final List<OaBrokerAuthor> differences = differencesForSingleAuthor(
 			"0000-0001-9613-6639", "0000-0001-9613-6639");
 		assertTrue(differences.isEmpty());
 	}
 	/**
 	 * Builds a source and a target entity, each with one creator named "Claudio Atzori", and runs the matcher.
 	 *
 	 * @param sourceOrcid ORCID given to the source creator, may be null
 	 * @param targetOrcid ORCID given to the target creator, may be null
 	 * @return the list returned by the matcher for the two entities
 	 */
 	private List<OaBrokerAuthor> differencesForSingleAuthor(final String sourceOrcid, final String targetOrcid) {
 		final OaBrokerMainEntity src = new OaBrokerMainEntity();
 		final OaBrokerMainEntity trg = new OaBrokerMainEntity();
 		src.getCreators().add(new OaBrokerAuthor("Claudio Atzori", sourceOrcid));
 		trg.getCreators().add(new OaBrokerAuthor("Claudio Atzori", targetOrcid));
 		return this.matcher.findDifferences(src, trg);
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java
+++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java
@ -2,27 +2,32 @@
 package eu.dnetlib.dhp.broker.oa.util;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerTypedValue;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Instance;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-class ConversionUtilsTest {
+public class ConversionUtilsTest {
 	@BeforeEach
-	void setUp() throws Exception {
+	public void setUp() throws Exception {
 	}
 	@Test
-	void testAllResultPids() {
+	public void testAllResultPids() {
 		final Qualifier qf = new Qualifier();
 		qf.setClassid("test");
 		qf.setClassname("test");
@ -91,4 +96,42 @@ class ConversionUtilsTest {
 		assertEquals(6, list.size());
 	}
 	/**
 	 * Converts a result with three authors (a bare ORCID, an ORCID given as an http://orcid.org URL, and no
 	 * ORCID) and checks the ORCID exposed by each broker creator.
 	 */
 	// Without @Test the JUnit 5 engine does not discover this method, so it was never executed.
 	@Test
 	public void testOafResultToBrokerResult() {
 		final Author a1 = createAuthor("Michele Artini", "0000-0002-4406-428X");
 		final Author a2 = createAuthor("Claudio Atzori", "http://orcid.org/0000-0001-9613-6639");
 		final Author a3 = createAuthor("Alessia Bardi", null);
 		final Result r = new Result();
 		r.setAuthor(Arrays.asList(a1, a2, a3));
 		final OaBrokerMainEntity br = ConversionUtils.oafResultToBrokerResult(r);
 		assertEquals(3, br.getCreators().size());
 		assertEquals("0000-0002-4406-428X", br.getCreators().get(0).getOrcid());
 		// the URL form is expected to come out as the bare identifier
 		assertEquals("0000-0001-9613-6639", br.getCreators().get(1).getOrcid());
 		assertNull(br.getCreators().get(2).getOrcid());
 	}
 	/**
 	 * Builds an {@link Author} fixture.
 	 *
 	 * @param name  the full name to set on the author
 	 * @param orcid the ORCID value to attach as a pid qualified with {@code ModelConstants.ORCID};
 	 *              when null no pid is set
 	 * @return the populated author
 	 */
 	private Author createAuthor(final String name, final String orcid) {
 		final Author a = new Author();
 		// use the supplied name: it was previously hard-coded, so every fixture author had the same full name
 		a.setFullname(name);
 		if (orcid != null) {
 			final Qualifier q = new Qualifier();
 			q.setClassid(ModelConstants.ORCID);
 			q.setClassname(ModelConstants.ORCID);
 			q.setSchemeid("dnet:pids");
 			q.setSchemename("dnet:pids");
 			final StructuredProperty pid = new StructuredProperty();
 			pid.setQualifier(q);
 			pid.setValue(orcid);
 			a.setPid(Arrays.asList(pid));
 		}
 		return a;
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/pom.xml
+++ b/dhp-workflows/dhp-dedup-openaire/pom.xml
@ -53,24 +53,10 @@
            <artifactId>dhp-pace-core</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <dependency>
            <groupId>org.scala-lang.modules</groupId>
            <artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang.modules</groupId>
            <artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
            <version>2.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
@ -79,16 +65,10 @@
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_${scala.binary.version}</artifactId>
        </dependency>
        <dependency>
            <groupId>com.arakelian</groupId>
            <artifactId>java-jq</artifactId>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
@ -101,10 +81,6 @@
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple3;
 import scala.collection.JavaConversions;
@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			Dataset<Row> pivotHistory = spark
 				.createDataset(
 					Collections.emptyList(),
-					RowEncoder
+					SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
 						.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
 			if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
 				pivotHistory = spark
@ -203,8 +203,8 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			WindowSpec w = Window
 				.partitionBy("groupId")
 				.orderBy(
 					col("lastUsage").desc_nulls_last(),
 					col("pidType").asc_nulls_last(),
 					col("lastUsage").desc_nulls_last(),
 					col("collectedfrom").desc_nulls_last(),
 					col("date").asc_nulls_last(),
 					col("id").asc_nulls_last());
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.Organization;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
 			.map(
 				(MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
 					"",
-					r._1()._2().getOriginalId().get(0),
+					Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
-					r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
+					Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
-					r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
+					Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
-					r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
+					Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
-					r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
+					Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
-					r._1()._2().getCollectedfrom().get(0).getValue(),
+					Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
 					"",
 					structuredPropertyListToString(r._1()._2().getPid()),
 					parseECField(r._1()._2().getEclegalbody()),
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
 					final Organization o = r._2()._2();
 					return new OrgSimRel(
 						r._1()._1(),
-						o.getOriginalId().get(0),
+						Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
 						Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
 						Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
 						Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
 			.map(
 				(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
 					OrgSimRel orgSimRel = r._1()._2();
-					orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
+					orgSimRel
 						.setLocal_id(
 							Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
 					return orgSimRel;
 				},
 				Encoders.bean(OrgSimRel.class));
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.catalyst.encoders.RowEncoder;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple2;
 import scala.Tuple3;
@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
 		StructType idsSchema = StructType
 			.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
-		Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+		Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
 		for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
 			String entityPath = graphBasePath + '/' + entityType.name();
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml
@ -15,4 +15,12 @@
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>pivotHistoryDatabase</name>
        <value>&#x200B;</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml
@ -198,6 +198,8 @@
            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
            <arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
        </spark>
        <ok to="PrepareOrgRels"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
@ -0,0 +1,103 @@
 package eu.dnetlib.dhp.oa.dedup;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.Serializable;
 import java.lang.reflect.InvocationTargetException;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
 import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import scala.Tuple2;
 /**
  * Merges the sample records of {@code dataset_merge.json} through {@link MergeUtils#mergeGroup} and checks
  * the identifier and the number of instances of the merged dataset.
  */
 class DatasetMergerTest implements Serializable {
 	private List<Tuple2<String, Dataset>> datasets;
 	private String testEntityBasePath;
 	private DataInfo dataInfo;
 	private final String dedupId = "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148";
 	private Dataset dataset_top;
 	/** Loads the sample datasets from the test resources and prepares the auxiliary fixtures. */
 	@BeforeEach
 	public void setUp() throws Exception {
 		testEntityBasePath = Paths
 			.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
 			.toFile()
 			.getAbsolutePath();
 		datasets = readSample(testEntityBasePath + "/dataset_merge.json", Dataset.class);
 		dataset_top = getTopPub(datasets);
 		dataInfo = setDI();
 	}
 	@Test
 	void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
 		Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
 		// verify id
 		assertEquals(dedupId, pub_merged.getId());
 		assertEquals(2, pub_merged.getInstance().size());
 	}
 	/**
 	 * @return a {@link DataInfo} with trust "0.9", inferred, not deleted by inference, provenance "testing"
 	 */
 	public DataInfo setDI() {
 		DataInfo dataInfo = new DataInfo();
 		dataInfo.setTrust("0.9");
 		dataInfo.setDeletedbyinference(false);
 		dataInfo.setInferenceprovenance("testing");
 		dataInfo.setInferred(true);
 		return dataInfo;
 	}
 	/**
 	 * Picks the record with the highest trust value.
 	 *
 	 * @param publications the (id, dataset) pairs to scan; each dataset must carry a parsable trust
 	 * @return the dataset with the strictly highest trust, or an empty {@link Dataset} when no trust exceeds 0
 	 */
 	public Dataset getTopPub(List<Tuple2<String, Dataset>> publications) {
 		// primitives avoid boxing on every comparison
 		double maxTrust = 0.0;
 		Dataset maxPub = new Dataset();
 		for (Tuple2<String, Dataset> publication : publications) {
 			final double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
 			if (pubTrust > maxTrust) {
 				maxTrust = pubTrust;
 				maxPub = publication._2();
 			}
 		}
 		return maxPub;
 	}
 	/**
 	 * Reads a file containing one JSON record per line.
 	 *
 	 * @param path  the file to read
 	 * @param clazz the type each line is deserialized to
 	 * @param <T>   the record type
 	 * @return one (id, record) pair per line, where the id is extracted with the JSON path {@code $.id}
 	 * @throws IllegalStateException if the file cannot be read or a line cannot be parsed
 	 */
 	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
 		final List<Tuple2<String, T>> res = new ArrayList<>();
 		// one mapper for the whole file instead of one per line
 		final ObjectMapper mapper = new ObjectMapper();
 		// try-with-resources closes the reader even when parsing a line fails
 		try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
 			for (String line = reader.readLine(); line != null; line = reader.readLine()) {
 				res
 					.add(
 						new Tuple2<>(
 							MapDocumentUtil.getJPathString("$.id", line),
 							mapper.readValue(line, clazz)));
 			}
 		} catch (IOException e) {
 			// fail fast: a silently truncated fixture would only surface later as a misleading assertion failure
 			throw new IllegalStateException("Unable to read sample file " + path, e);
 		}
 		return res;
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -93,14 +93,14 @@ class EntityMergerTest implements Serializable {
 		assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
 		assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
 		assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
-		assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
+		assertEquals(pub_top.getResulttype().getClassid(), pub_merged.getResulttype().getClassid());
-		assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
+		assertEquals(pub_top.getLanguage().getClassid(), pub_merged.getLanguage().getClassid());
-		assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
+		assertEquals("Elsevier BV", pub_merged.getPublisher().getValue());
-		assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
+		assertEquals(pub_top.getEmbargoenddate().getValue(), pub_merged.getEmbargoenddate().getValue());
 		assertEquals(pub_top.getResourcetype().getClassid(), "");
 		assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
 		assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
-		assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
+		// assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
 		assertEquals(3, pub_merged.getInstance().size());
 		assertEquals(2, pub_merged.getCountry().size());
 		assertEquals(0, pub_merged.getSubject().size());
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json
@ -49,7 +49,7 @@
          },
          {
            "field": "country",
-            "comparator": "exactMatch",
+            "comparator": "countryMatch",
            "weight": 1,
            "countIfUndefined": "true",
            "params": {}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
@ -172,7 +172,7 @@ public class SparkBulkTagJob {
 			.option("compression", "gzip")
 			.json(outputPath + "project");
-		readPath(spark, outputPath + "project", Datasource.class)
+		readPath(spark, outputPath + "project", Project.class)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
@ -50,7 +50,7 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 	 * @param subject
 	 */
 	private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
-												  Subject subject) {
+		Subject subject) {
 		vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
 			if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
@ -61,13 +61,14 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 					subject.getQualifier().setClassname(vocabulary.getName());
 				}
 			} else {
-				final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo())
+				final String provenanceActionClassId = Optional
-						.map(DataInfo::getProvenanceaction)
+					.ofNullable(subject.getDataInfo())
-						.map(Qualifier::getClassid)
+					.map(DataInfo::getProvenanceaction)
-						.orElse(null);
+					.map(Qualifier::getClassid)
 					.orElse(null);
 				if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
-						!"subject:fos".equals(provenanceActionClassId)) {
+					!"subject:fos".equals(provenanceActionClassId)) {
 					Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
 					VocabularyTerm term = vocabulary.getTerm(subject.getValue());
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@ -398,6 +398,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 			o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
 			o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
 			o.setCountry(prepareQualifierSplitting(rs.getString("country")));
 			o.setOrganizationType(Organization.OrganizationType.valueOf(rs.getString("typology")));
 			o.setDataInfo(info);
 			o.setLastupdatetimestamp(lastUpdateTimestamp);
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -156,6 +156,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -190,6 +191,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -224,6 +226,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -258,6 +261,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -292,6 +296,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -326,6 +331,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -360,6 +366,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -394,6 +401,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -116,17 +116,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=10000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>8000</arg>
+            <arg>--numPartitions</arg><arg>10000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -143,17 +145,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=4000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>4000</arg>
+            <arg>--numPartitions</arg><arg>8000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -170,11 +174,13 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
@ -197,17 +203,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>300</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -224,17 +232,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=200
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>100</arg>
+            <arg>--numPartitions</arg><arg>200</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -251,17 +261,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/organization</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>400</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -278,17 +290,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/project</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>100</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -305,17 +319,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/relation</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>10000</arg>
+            <arg>--numPartitions</arg><arg>15000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
@ -45,6 +45,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=15000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -79,6 +80,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=10000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql
@ -28,7 +28,8 @@ SELECT
    (array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1]              AS ecinternationalorganization,
    (array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1]                      AS ecenterprise,
    (array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1]                    AS ecsmevalidated,
-    (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1]                       AS ecnutscode
+    (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1]                       AS ecnutscode,
    org_types.name                                                                                              AS typology
 FROM organizations o
 	LEFT OUTER JOIN acronyms a    ON (a.id = o.id)
 	LEFT OUTER JOIN urls u        ON (u.id = o.id)
@ -37,6 +38,7 @@ FROM organizations o
 	LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different')
    LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
    LEFT OUTER JOIN other_ids idup  ON (od.id = idup.id)
    LEFT OUTER JOIN org_types ON (org_types.val = o.type)
 WHERE
    o.status = 'approved' OR o.status = 'suggested'
 GROUP BY
@ -44,4 +46,5 @@ GROUP BY
 	o.name,
 	o.creation_date,
 	o.modification_date,
-	o.country;
+	o.country,
 	org_types.name;
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json
@ -0,0 +1,5 @@
 [
  {"paramName":"mt",  "paramLongName":"master",     "paramDescription": "should be local or yarn",  "paramRequired": false},
  {"paramName":"s",   "paramLongName":"sourcePath", "paramDescription": "the source Path",           "paramRequired": true},
  {"paramName":"t",   "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json
@ -0,0 +1,166 @@
 {
  "cites":{
    "original":"Cites",
    "inverse":"IsCitedBy"
  },
  "compiles":{
    "original":"Compiles",
    "inverse":"IsCompiledBy"
  },
  "continues":{
    "original":"Continues",
    "inverse":"IsContinuedBy"
  },
  "derives":{
    "original":"IsSourceOf",
    "inverse":"IsDerivedFrom"
  },
  "describes":{
    "original":"Describes",
    "inverse":"IsDescribedBy"
  },
  "documents":{
    "original":"Documents",
    "inverse":"IsDocumentedBy"
  },
  "hasmetadata":{
    "original":"HasMetadata",
    "inverse":"IsMetadataOf"
  },
  "hasassociationwith":{
    "original":"HasAssociationWith",
    "inverse":"HasAssociationWith"
  },
  "haspart":{
    "original":"HasPart",
    "inverse":"IsPartOf"
  },
  "hasversion":{
    "original":"HasVersion",
    "inverse":"IsVersionOf"
  },
  "iscitedby":{
    "original":"IsCitedBy",
    "inverse":"Cites"
  },
  "iscompiledby":{
    "original":"IsCompiledBy",
    "inverse":"Compiles"
  },
  "iscontinuedby":{
    "original":"IsContinuedBy",
    "inverse":"Continues"
  },
  "isderivedfrom":{
    "original":"IsDerivedFrom",
    "inverse":"IsSourceOf"
  },
  "isdescribedby":{
    "original":"IsDescribedBy",
    "inverse":"Describes"
  },
  "isdocumentedby":{
    "original":"IsDocumentedBy",
    "inverse":"Documents"
  },
  "isidenticalto":{
    "original":"IsIdenticalTo",
    "inverse":"IsIdenticalTo"
  },
  "ismetadatafor":{
    "original":"IsMetadataFor",
    "inverse":"IsMetadataOf"
  },
  "ismetadataof":{
    "original":"IsMetadataOf",
    "inverse":"IsMetadataFor"
  },
  "isnewversionof":{
    "original":"IsNewVersionOf",
    "inverse":"IsPreviousVersionOf"
  },
  "isobsoletedby":{
    "original":"IsObsoletedBy",
    "inverse":"Obsoletes"
  },
  "isoriginalformof":{
    "original":"IsOriginalFormOf",
    "inverse":"IsVariantFormOf"
  },
  "ispartof":{
    "original":"IsPartOf",
    "inverse":"HasPart"
  },
  "ispreviousversionof":{
    "original":"IsPreviousVersionOf",
    "inverse":"IsNewVersionOf"
  },
  "isreferencedby":{
    "original":"IsReferencedBy",
    "inverse":"References"
  },
  "isrelatedto":{
    "original":"IsRelatedTo",
    "inverse":"IsRelatedTo"
  },
  "isrequiredby":{
    "original":"IsRequiredBy",
    "inverse":"Requires"
  },
  "isreviewedby":{
    "original":"IsReviewedBy",
    "inverse":"Reviews"
  },
  "issourceof":{
    "original":"IsSourceOf",
    "inverse":"IsDerivedFrom"
  },
  "issupplementedby":{
    "original":"IsSupplementedBy",
    "inverse":"IsSupplementTo"
  },
  "issupplementto":{
    "original":"IsSupplementTo",
    "inverse":"IsSupplementedBy"
  },
  "isvariantformof":{
    "original":"IsVariantFormOf",
    "inverse":"IsOriginalFormOf"
  },
  "isversionof":{
    "original":"IsVersionOf",
    "inverse":"HasVersion"
  },
  "obsoletes":{
    "original":"Obsoletes",
    "inverse":"IsObsoletedBy"
  },
  "references":{
    "original":"References",
    "inverse":"IsReferencedBy"
  },
  "requires":{
    "original":"Requires",
    "inverse":"IsRequiredBy"
  },
  "related":{
    "original":"IsRelatedTo",
    "inverse":"IsRelatedTo"
  },
  "reviews":{
    "original":"Reviews",
    "inverse":"IsReviewedBy"
  },
  "unknown":{
    "original":"Unknown",
    "inverse":"Unknown"
  },
  "isamongtopnsimilardocuments": {
    "original": "IsAmongTopNSimilarDocuments",
    "inverse": "HasAmongTopNSimilarDocuments"
  },
  "hasamongtopnsimilardocuments": {
    "original": "HasAmongTopNSimilarDocuments",
    "inverse": "IsAmongTopNSimilarDocuments"
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala
@ -25,27 +25,38 @@ object SparkApplyHostedByMapToResult {
          val i = p.getInstance().asScala
          if (i.size == 1) {
            val inst: Instance = i.head
-            inst.getHostedby.setKey(ei.getHostedById)
+            patchInstance(p, ei, inst)
            inst.getHostedby.setValue(ei.getName)
            if (ei.getOpenAccess) {
              inst.setAccessright(
                OafMapperUtils.accessRight(
                  ModelConstants.ACCESS_RIGHT_OPEN,
                  "Open Access",
                  ModelConstants.DNET_ACCESS_MODES,
                  ModelConstants.DNET_ACCESS_MODES
                )
              )
              inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
              p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
            }
          } else {
            val cf = i.map(ii => ii.getCollectedfrom.getValue)
            if (cf.contains("Crossref")) {
              i.foreach(ii => {
                patchInstance(p, ei, ii)
              })
            }
          }
        }
        p
      })(Encoders.bean(classOf[Publication]))
  }
  /** Overwrites the hostedby of the given instance with the datasource described by `ei`; when that
    * datasource is flagged as open access, the instance access right is set to Open Access with the gold
    * route and the best access right of the publication is recomputed from all its instances.
    *
    * @param p    publication owning the instance
    * @param ei   entity info providing the hostedby id, name and open access flag
    * @param inst instance to patch
    */
  private def patchInstance(p: Publication, ei: EntityInfo, inst: Instance): Unit = {
    val hostedBy = inst.getHostedby
    hostedBy.setKey(ei.getHostedById)
    hostedBy.setValue(ei.getName)
    if (ei.getOpenAccess) {
      val openAccess = OafMapperUtils.accessRight(
        ModelConstants.ACCESS_RIGHT_OPEN,
        "Open Access",
        ModelConstants.DNET_ACCESS_MODES,
        ModelConstants.DNET_ACCESS_MODES
      )
      inst.setAccessright(openAccess)
      inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
      // recomputed after the instance has been patched
      p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()))
    }
  }
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala
@ -0,0 +1,258 @@
 package eu.dnetlib.dhp.sx.graph
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
 import eu.dnetlib.dhp.schema.sx.scholix.{
  Scholix,
  ScholixCollectedFrom,
  ScholixEntityId,
  ScholixIdentifier,
  ScholixRelationship,
  ScholixResource
 }
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
 import scala.collection.JavaConverters._
 import scala.io.Source
 /** Flattened view of a relation, as stored by the bidirectional relation step of the
   * Scholexplorer dump and read back when the Scholix links are generated.
   *
   * @param source        identifier of the source entity
   * @param target        identifier of the target entity
   * @param relclass      semantic class of the relation
   * @param id            identifier of the relation
   * @param collectedfrom datasources the relation was collected from, as key/value pairs
   */
 case class RelationInfo(
  source: String,
  target: String,
  relclass: String,
  id: String,
  collectedfrom: Seq[RelKeyValue]
 )
 /** Key/value pair describing a datasource: `key` is its identifier, `value` its name. */
 case class RelKeyValue(key: String, value: String)
 object ScholexplorerUtils {
  // Schema name set on every ScholixIdentifier that carries an OpenAIRE datasource identifier.
  val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
  // JSON mapper used by updateTarget to serialize the final Scholix records.
  val mapper = new ObjectMapper()
  // One entry of the relation vocabulary: the relation name (`original`) and the name of
  // its inverse (`inverse`); both end up in the ScholixRelationship built by generateScholix.
  case class RelationVocabulary(original: String, inverse: String) {}
  /** Vocabulary of the supported semantic relations, loaded once from the classpath
    * resource `/eu/dnetlib/dhp/sx/relation/relations.json`.
    *
    * Callers look entries up by the lower-cased relation class. The resource is read as
    * UTF-8 and the underlying stream is always closed once its content has been loaded.
    *
    * @throws IllegalStateException if the resource is not available on the classpath
    */
  val relations: Map[String, RelationVocabulary] = {
    val resourcePath = "/eu/dnetlib/dhp/sx/relation/relations.json"
    val stream = getClass.getResourceAsStream(resourcePath)
    // fail with an explicit message instead of an anonymous NullPointerException
    if (stream == null)
      throw new IllegalStateException(s"Missing classpath resource: $resourcePath")

    val source = Source.fromInputStream(stream, "UTF-8")
    val input =
      try source.mkString
      finally source.close() // also closes the wrapped input stream

    implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
    parse(input).extract[Map[String, RelationVocabulary]]
  }
  /** Returns the name of the inverse of the given relation class.
    *
    * The lookup in the relation vocabulary is case-insensitive on `rel`.
    *
    * @param rel the relation class to invert
    * @return the inverse relation name, or `null` when `rel` is not part of the vocabulary
    */
  def invRel(rel: String): String =
    relations.get(rel.toLowerCase) match {
      case Some(vocabularyEntry) if vocabularyEntry != null => vocabularyEntry.inverse
      case _                                                => null
    }
  /** Builds the OpenAIRE Explore URL of a datasource.
    *
    * The first three characters of the identifier are stripped before it is appended
    * to the URL.
    *
    * @param id the datasource identifier
    * @return the URL, or `null` when `id` is `null` or not longer than 12 characters
    */
  def generateDatasourceOpenAIREURLS(id: String): String =
    Option(id)
      .filter(_.length > 12)
      .map(dsId => s"https://explore.openaire.eu/search/dataprovider?datasourceId=${dsId.substring(3)}")
      .orNull
  /** Pairs each persistent identifier with the first URL that contains its value.
    *
    * The match is case-insensitive. A PID whose value is `null` is paired with `null`
    * and `null` entries of `urls` are ignored, instead of failing with a
    * NullPointerException.
    *
    * @param pidValue the persistent identifiers to resolve
    * @param urls     the candidate URLs
    * @return one `(pid, url)` pair per input PID, in input order; `url` is `null` when
    *         no candidate contains the PID value
    */
  def findURLForPID(
    pidValue: List[StructuredProperty],
    urls: List[String]
  ): List[(StructuredProperty, String)] = {
    // lower-case every candidate once, rather than once per (pid, url) comparison
    val candidates: List[(String, String)] =
      urls.filter(_ != null).map(u => (u, u.toLowerCase))

    pidValue.map { p =>
      val matchingUrl = Option(p.getValue)
        .map(_.toLowerCase)
        .flatMap(pv => candidates.collectFirst { case (url, lowered) if lowered.contains(pv) => url })
        .orNull
      (p, matchingUrl)
    }
  }
  /** Collects the typed identifiers exposed by the instances of a result.
    *
    * Only instances having at least one URL and a non-null PID list are considered; every
    * PID of such an instance is turned into a ScholixIdentifier carrying the PID value, the
    * class id of its qualifier and the URL resolved by `findURLForPID`.
    *
    * @param r the result to inspect
    * @return the distinct identifiers, or an empty list when the result has no instances
    */
  def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
    val instances = r.getInstance()
    if (instances == null || instances.isEmpty)
      List()
    else
      instances.asScala
        .filter(i => i.getUrl != null && !i.getUrl.isEmpty && i.getPid != null)
        .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
        .map { case (pid, url) =>
          new ScholixIdentifier(pid.getValue, pid.getQualifier.getClassid, url)
        }
        .distinct
        .toList
  }
  /** Converts a graph result into the ScholixResource used in the Scholexplorer dump.
    *
    * The resource carries the result identifier, its typed identifiers, type and subtype,
    * the first title, the authors (each with at most three identifiers), the first
    * available acceptance date, the hosting datasources as publishers (those named
    * "unknown" are skipped) and the datasources the result was collected from.
    *
    * Optional parts of the result (result type, instance type, hosting datasource,
    * collectedfrom) are tolerated when missing: the corresponding property is simply
    * left unset instead of raising an exception.
    *
    * @param result the result to convert
    * @return the resource, or `null` when the result has no instances, no PIDs or no
    *         typed identifier can be extracted from its instances
    */
  def generateScholixResourceFromResult(result: Result): ScholixResource = {
    if (result.getInstance() == null || result.getInstance().isEmpty)
      return null
    if (result.getPid == null || result.getPid.isEmpty)
      return null

    val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
    if (persistentIdentifiers.isEmpty)
      return null

    val instances = result.getInstance().asScala

    val r = new ScholixResource
    r.setDnetIdentifier(result.getId)
    r.setIdentifier(persistentIdentifiers.asJava)
    if (result.getResulttype != null)
      r.setObjectType(result.getResulttype.getClassid)

    // headOption instead of head: no instance may declare an instance type
    r.setObjectSubType(
      instances
        .filter(i => i != null && i.getInstancetype != null)
        .map(i => i.getInstancetype.getClassname)
        .headOption
        .orNull
    )

    if (result.getTitle != null && !result.getTitle.isEmpty)
      r.setTitle(result.getTitle.asScala.head.getValue)

    if (result.getAuthor != null && !result.getAuthor.isEmpty) {
      val authors: List[ScholixEntityId] =
        result.getAuthor.asScala
          .map(a => {
            val entity = new ScholixEntityId()
            entity.setName(a.getFullname)
            if (a.getPid != null && a.getPid.size() > 0)
              entity.setIdentifiers(
                a.getPid.asScala
                  .take(3) // keep at most three identifiers per author
                  .map(sp => {
                    val id = new ScholixIdentifier()
                    id.setIdentifier(sp.getValue)
                    id.setSchema(sp.getQualifier.getClassid)
                    id
                  })
                  .toList
                  .asJava
              )
            entity
          })
          .toList
      if (authors.nonEmpty)
        r.setCreator(authors.asJava)
    }

    // the publication date is the first acceptance date found among the instances
    instances
      .filter(i => i.getDateofacceptance != null)
      .map(i => i.getDateofacceptance.getValue)
      .headOption
      .foreach(date => r.setPublicationDate(date))

    r.setPublisher(
      instances
        .map(i => i.getHostedby)
        .filter(h => h != null && !"unknown".equalsIgnoreCase(h.getValue))
        .map(h => datasourceEntityId(h.getValue, h.getKey))
        .distinct
        .asJava
    )

    if (result.getCollectedfrom != null)
      r.setCollectedFrom(
        result.getCollectedfrom.asScala
          .map(cf => {
            val scf = new ScholixCollectedFrom()
            scf.setProvisionMode("collected")
            scf.setCompletionStatus("complete")
            scf.setProvider(datasourceEntityId(cf.getValue, cf.getKey))
            scf
          })
          .asJava
      )

    r
  }

  /** Builds the Scholix entity describing a datasource, identified by its OpenAIRE identifier. */
  private def datasourceEntityId(name: String, openaireId: String): ScholixEntityId = {
    val id = new ScholixIdentifier()
    id.setIdentifier(openaireId)
    id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
    id.setUrl(generateDatasourceOpenAIREURLS(openaireId))
    val entity = new ScholixEntityId()
    entity.setName(name)
    entity.setIdentifiers(List(id).asJava)
    entity
  }
  /** Creates the Scholix link for a relation whose source resource is already resolved.
    *
    * The relation class is looked up (case-insensitively) in the relation vocabulary
    * before anything else is built. The link providers are the datasources the relation
    * was collected from, or OpenAIRE itself when none is declared. The target is a
    * placeholder carrying only the target identifier: it is replaced by `updateTarget`.
    *
    * @param relation the relation to export
    * @param source   the resource the relation starts from
    * @return the Scholix link, or `null` when the relation class is `null` or is not
    *         part of the relation vocabulary
    */
  def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
    // a null relclass is possible: invRel yields null for classes without a known inverse
    val semanticRelation: RelationVocabulary =
      Option(relation.relclass).flatMap(rc => relations.get(rc.toLowerCase)).orNull
    if (semanticRelation == null)
      return null

    val s: Scholix = new Scholix
    s.setSource(source)
    if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
      s.setLinkprovider(
        relation.collectedfrom
          .map(cf => {
            val eid = new ScholixEntityId()
            eid.setName(cf.value)
            val id = new ScholixIdentifier()
            id.setIdentifier(cf.key)
            id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
            id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
            eid.setIdentifiers(List(id).asJava)
            eid
          })
          .toList
          .asJava
      )
    else {
      // no provenance declared: OpenAIRE is reported as the link provider
      val eid = new ScholixEntityId()
      eid.setName("OpenAIRE")
      val id = new ScholixIdentifier()
      id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
      id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
      id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
      eid.setIdentifiers(List(id).asJava)
      s.setLinkprovider(List(eid).asJava)
    }
    s.setIdentifier(relation.id)
    s.setRelationship(
      new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
    )
    s.setPublicationDate(source.getPublicationDate)
    s.setPublisher(source.getPublisher)
    val mockTarget = new ScholixResource
    mockTarget.setDnetIdentifier(relation.target)
    s.setTarget(mockTarget)
    s
  }
  /** Sets the resolved target on a Scholix link and serializes the link as JSON.
    *
    * The publishers of the link become the distinct union of the publishers already set
    * on the link followed by those of the target, limited to the first ten.
    *
    * @param s the Scholix link to complete
    * @param t the resolved target resource
    * @return the JSON serialization of the updated link
    */
  def updateTarget(s: Scholix, t: ScholixResource): String = {
    s.setTarget(t)

    val sourceSide: Seq[ScholixEntityId] =
      Option(s.getPublisher).filterNot(_.isEmpty).map(_.asScala).getOrElse(List())
    val targetSide: Seq[ScholixEntityId] =
      Option(t.getPublisher).filterNot(_.isEmpty).map(_.asScala).getOrElse(List())

    val allPublishers = (sourceSide ++ targetSide).distinct.take(10).toList
    s.setPublisher(allPublishers.asJava)

    mapper.writeValueAsString(s)
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
@ -0,0 +1,141 @@
 package eu.dnetlib.dhp.sx.graph
 import eu.dnetlib.dhp.application.AbstractScalaApplication
 import eu.dnetlib.dhp.schema.oaf.{
  KeyValue,
  OtherResearchProduct,
  Publication,
  Relation,
  Result,
  Software,
  Dataset => OafDataset
 }
 import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
 import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql._
 import org.slf4j.{Logger, LoggerFactory}
 class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
  /** Entry point of the Spark application: reads the `sourcePath` and `targetPath`
    * arguments, then runs the three steps of the dump in sequence (bidirectional
    * relations, Scholix resources, Scholix links).
    */
  override def run(): Unit = {
    val graphPath = parser.get("sourcePath")
    log.info("sourcePath: {}", graphPath)
    val dumpPath = parser.get("targetPath")
    log.info("targetPath: {}", dumpPath)

    generateBidirectionalRelations(graphPath, dumpPath, spark)
    generateScholixResource(graphPath, dumpPath, spark)
    generateScholix(dumpPath, spark)
  }
  /** Converts every result of the graph into a ScholixResource.
    *
    * Publications, datasets, software and other research products are read from the
    * sub-folders of `inputPath`; results that cannot be converted are dropped. The union
    * of all the resources is written to `outputPath/resource`, overwriting previous content.
    *
    * @param inputPath  base path of the graph
    * @param outputPath base path of the dump
    * @param spark      the active Spark session
    */
  def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
    implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])

    // sub-folder name of each result type, with the schema used to read it
    val entityMap: Map[String, StructType] = Map(
      "publication"          -> Encoders.bean(classOf[Publication]).schema,
      "dataset"              -> Encoders.bean(classOf[OafDataset]).schema,
      "software"             -> Encoders.bean(classOf[Software]).schema,
      "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
    )

    val scholixResourceDS =
      entityMap.foldLeft[Dataset[ScholixResource]](spark.emptyDataset[ScholixResource]) {
        case (collected, (entityName, entitySchema)) =>
          println(s"adding $entityName")
          val converted = spark.read
            .schema(entitySchema)
            .json(s"$inputPath/$entityName")
            .as[Result]
            .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
            .filter(s => s != null)
          collected.union(converted)
      }

    scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
  }
  /** Materializes every result-to-result relation in both directions.
    *
    * Relations are read from `inputPath/relation`, keeping only those not deleted by
    * inference, linking two results (identifiers starting with `50`) and not expressing
    * a merge. Each relation is complemented with its inverse; inverse rows whose relation
    * class has no inverse in the vocabulary are discarded, since they would carry a null
    * relation class and therefore a null identifier. Rows are then de-duplicated on the
    * md5 of source, relation class and target and written to `otuputPath/relation`.
    *
    * @param inputPath  base path of the graph
    * @param otuputPath base path of the dump (the misspelled name is kept for callers
    *                   passing it by name)
    * @param spark      the active Spark session
    */
  def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
    import org.apache.spark.sql.functions.udf

    val relSchema = Encoders.bean(classOf[Relation]).schema

    val relDF = spark.read
      .schema(relSchema)
      .json(s"$inputPath/relation")
      .where(
        "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
        "and relClass <> 'merges' and relClass <> 'isMergedIn'"
      )
      .select("source", "target", "collectedfrom", "relClass")

    def invRel: String => String = { s =>
      ScholexplorerUtils.invRel(s)
    }
    val inverseRelationUDF = udf(invRel)

    val inverseRelation = relDF
      .select(
        col("target").alias("source"),
        col("source").alias("target"),
        col("collectedfrom"),
        inverseRelationUDF(col("relClass")).alias("relClass")
      )
      // invRel returns null for classes missing from the vocabulary: such rows would get a
      // null md5 id, collapse into a single group and break the Scholix generation
      .where(col("relClass").isNotNull)

    val bidRel = inverseRelation
      .union(relDF)
      .withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
      .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
      .drop("collectedfrom")
      .withColumnRenamed("cf", "collectedfrom")
      .groupBy(col("id"))
      .agg(
        first("source").alias("source"),
        first("target").alias("target"),
        first("relClass").alias("relClass"),
        first("collectedfrom").alias("collectedfrom")
      )

    bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
  }
  /** Joins relations and resources into the final Scholix links.
    *
    * Relations (`outputPath/relation`) are first joined with the resources
    * (`outputPath/resource`) on their source, producing one link per relation whose
    * class is known to the relation vocabulary; relations for which no link can be
    * generated are skipped. The links are then joined, by relation id, with the
    * resources matching the relation target, completed with that target and written
    * as gzip-compressed JSON lines to `outputPath/scholix`.
    *
    * @param outputPath base path of the dump
    * @param spark      the active Spark session
    */
  def generateScholix(outputPath: String, spark: SparkSession): Unit = {
    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
    import spark.implicits._

    val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
    val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]

    val scholix_one_verse = relations
      .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
      // generateScholix returns null for unsupported relation classes: drop those links
      // here, before their identifier is read
      .flatMap(res =>
        Option(ScholexplorerUtils.generateScholix(res._1, res._2))
          .map(s => (s.getIdentifier, s))
          .toList
      )(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))

    val resourceTarget = relations
      .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
      .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))

    scholix_one_verse
      .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
      .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .text(s"$outputPath/scholix")
  }
 }
 /** Launcher of the Scholexplorer dump Spark application. */
 object SparkCreateScholexplorerDump {
  val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)

  /** Creates the application from the command line arguments, described by the
    * `create_scholix_dump_params.json` resource, then initializes and runs it.
    */
  def main(args: Array[String]): Unit = {
    val application = new SparkCreateScholexplorerDump(
      propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json",
      args = args,
      log = logger
    )
    application.initialize().run()
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
@ -0,0 +1,26 @@
 package eu.dnetlib.dhp.sx.graph.scholix
 import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
 import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.junit.jupiter.api.Test
 import org.objenesis.strategy.StdInstantiatorStrategy
 /** Manual smoke test of the Scholix generation, run against a local sample of the dump. */
 class ScholixGenerationTest {

  /** Generates the Scholix links from previously generated relations and resources.
    *
    * The input is a folder of the developer workstation, so the test is skipped (rather
    * than failed) on every machine where that folder does not exist.
    */
  @Test
  def generateScholix(): Unit = {
    val workingPath = "/home/sandro/Downloads/scholix/"
    org.junit.jupiter.api.Assumptions.assumeTrue(
      new java.io.File(workingPath).isDirectory,
      s"local sample data not available at $workingPath"
    )

    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    val app = new SparkCreateScholexplorerDump(null, null, null)
    // The steps below produce the input of generateScholix from a sample of the graph:
    //   app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
    //   app.generateBidirectionalRelations(
    //     "/home/sandro/Downloads/scholix_sample/",
    //     "/home/sandro/Downloads/scholix/",
    //     spark
    //   )
    app.generateScholix(workingPath, spark)
  }
 }
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@ -18,7 +18,7 @@
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
-                        <phase>initialize</phase>
+                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
@ -59,12 +59,6 @@
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-api</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
@ -160,6 +154,26 @@
                    <groupId>org.apache.zookeeper</groupId>
                    <artifactId>zookeeper</artifactId>
                </exclusion>
                <exclusion>
                    <artifactId>ant</artifactId>
                    <groupId>org.apache.ant</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>antlr4-runtime</artifactId>
                    <groupId>org.antlr</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>woodstox-core</artifactId>
                    <groupId>com.fasterxml.woodstox</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>log4j</artifactId>
                    <groupId>*</groupId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.logging.log4j</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
@ -206,5 +220,90 @@
    </dependencies>
    <profiles>
        <profile>
            <id>spark-24</id>
            <activation>
                <activeByDefault>true</activeByDefault>
            </activation>
            <build>
                <plugins>
                    <plugin>
                        <groupId>org.codehaus.mojo</groupId>
                        <artifactId>build-helper-maven-plugin</artifactId>
                        <version>3.4.0</version>
                        <executions>
                            <execution>
                                <phase>generate-sources</phase>
                                <goals>
                                    <goal>add-source</goal>
                                </goals>
                                <configuration>
                                    <sources>
                                        <source>src/main/sparksolr-3</source>
                                    </sources>
                                </configuration>
                            </execution>
                        </executions>
                    </plugin>
                </plugins>
            </build>
        </profile>
        <profile>
            <id>spark-34</id>
            <build>
                <plugins>
                    <plugin>
                        <groupId>org.codehaus.mojo</groupId>
                        <artifactId>build-helper-maven-plugin</artifactId>
                        <version>3.4.0</version>
                        <executions>
                            <execution>
                                <phase>generate-sources</phase>
                                <goals>
                                    <goal>add-source</goal>
                                </goals>
                                <configuration>
                                    <sources>
                                        <source>src/main/sparksolr-4</source>
                                    </sources>
                                </configuration>
                            </execution>
                        </executions>
                    </plugin>
                </plugins>
            </build>
        </profile>
        <profile>
            <id>spark-35</id>
            <build>
                <plugins>
                    <plugin>
                        <groupId>org.codehaus.mojo</groupId>
                        <artifactId>build-helper-maven-plugin</artifactId>
                        <version>3.4.0</version>
                        <executions>
                            <execution>
                                <phase>generate-sources</phase>
                                <goals>
                                    <goal>add-source</goal>
                                </goals>
                                <configuration>
                                    <sources>
                                        <source>src/main/sparksolr-4</source>
                                    </sources>
                                </configuration>
                            </execution>
                        </executions>
                    </plugin>
                </plugins>
            </build>
        </profile>
    </profiles>
 </project>
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java
@ -31,7 +31,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.provision.XmlConverterJob;
 import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
 import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
@ -48,7 +47,7 @@ public class IrishOaiExporterJob {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					XmlConverterJob.class
+					IrishOaiExporterJob.class
 						.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json")));
 		parser.parseArgument(args);
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
 					result
 						.getTitle()
 						.stream()
 						.filter(t -> StringUtils.isNotBlank(t.getValue()))
 						.findFirst()
 						.map(StructuredProperty::getValue)
 						.ifPresent(
-							title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
+							title -> {
 								re.setTitle(title);
 								re
 									.getTitle()
 									.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
 							});
 				}
 				if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
 					result
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
 import static org.apache.spark.sql.functions.*;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.expressions.UserDefinedFunction;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -45,9 +37,9 @@ import scala.Tuple2;
 /**
 * PayloadConverterJob converts the JoinedEntities as XML records
 */
-public class XmlConverterJob {
+public class PayloadConverterJob {
-	private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
+	private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);
 	public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
@ -56,8 +48,8 @@ public class XmlConverterJob {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					XmlConverterJob.class
+					PayloadConverterJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
+						.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
 		parser.parseArgument(args);
 		final Boolean isSparkSessionManaged = Optional
@ -72,6 +64,12 @@ public class XmlConverterJob {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		final Boolean validateXML = Optional
 			.ofNullable(parser.get("validateXML"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.FALSE);
 		log.info("validateXML: {}", validateXML);
 		final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
 		log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
@ -86,18 +84,19 @@ public class XmlConverterJob {
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			removeOutputDir(spark, outputPath);
-			convertToXml(
+			createPayloads(
 				spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
-				VocabularyGroup.loadVocsFromIS(isLookup));
+				VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
 		});
 	}
-	private static void convertToXml(
+	private static void createPayloads(
 		final SparkSession spark,
 		final String inputPath,
 		final String outputPath,
 		final ContextMapper contextMapper,
-		final VocabularyGroup vocabularies) {
+		final VocabularyGroup vocabularies,
 		final Boolean validateXML) {
 		final XmlRecordFactory recordFactory = new XmlRecordFactory(
 			prepareAccumulators(spark.sparkContext()),
@ -118,7 +117,7 @@ public class XmlConverterJob {
 			.as(Encoders.kryo(JoinedEntity.class))
 			.map(
 				(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
-					recordFactory.build(je),
+					recordFactory.build(je, validateXML),
 					ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
 				Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
 			.map(
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
@ -2,42 +2,34 @@
 package eu.dnetlib.dhp.oa.provision;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static org.apache.spark.sql.functions.col;
 import java.util.HashSet;
 import java.util.Optional;
 import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.expressions.Aggregator;
+import org.apache.spark.sql.expressions.Window;
 import org.apache.spark.sql.expressions.WindowSpec;
 import org.apache.spark.sql.functions;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
 import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
 import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import scala.Tuple2;
 /**
 * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
@ -130,132 +122,36 @@ public class PrepareRelationsJob {
 	private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
 		Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
-		JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
+		WindowSpec source_w = Window
-			.filter(rel -> !(rel.getSource().startsWith("unresolved") || rel.getTarget().startsWith("unresolved")))
+			.partitionBy("source", "subRelType")
-			.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
+			.orderBy(col("target").desc_nulls_last());
 			.filter(rel -> !relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())));
-		JavaRDD<Relation> pruned = pruneRels(
+		WindowSpec target_w = Window
-			pruneRels(
+			.partitionBy("target", "subRelType")
-				rels,
+			.orderBy(col("source").desc_nulls_last());
 				sourceMaxRelations, relPartitions, (Function<Relation, String>) Relation::getSource),
 			targetMaxRelations, relPartitions, (Function<Relation, String>) Relation::getTarget);
 		spark
 			.createDataset(pruned.rdd(), Encoders.bean(Relation.class))
 			.repartition(relPartitions)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.parquet(outputPath);
 	}
 	private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, int maxRelations,
 		int relPartitions, Function<Relation, String> idFn) {
 		return rels
 			.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r))
 			.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
 			.groupBy(Tuple2::_1)
 			.map(Tuple2::_2)
 			.map(t -> Iterables.limit(t, maxRelations))
 			.flatMap(Iterable::iterator)
 			.map(Tuple2::_2);
 	}
 	// experimental
 	private static void prepareRelationsDataset(
 		SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
 		int relPartitions) {
 		spark
 			.read()
-			.textFile(inputRelationsPath)
+			.schema(Encoders.bean(Relation.class).schema())
-			.repartition(relPartitions)
+			.json(inputRelationsPath)
-			.map(
+			.where("source NOT LIKE 'unresolved%' AND  target  NOT LIKE 'unresolved%'")
-				(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
+			.where("datainfo.deletedbyinference != true")
-				Encoders.kryo(Relation.class))
+			.where(
-			.filter((FilterFunction<Relation>) rel -> !rel.getDataInfo().getDeletedbyinference())
+				relationFilter.isEmpty() ? ""
-			.filter((FilterFunction<Relation>) rel -> !relationFilter.contains(rel.getRelClass()))
+					: "lower(relClass) NOT IN ("
-			.groupByKey(
+						+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
-				(MapFunction<Relation, String>) Relation::getSource,
+			.withColumn("source_w_pos", functions.row_number().over(source_w))
-				Encoders.STRING())
+			.where("source_w_pos < " + sourceMaxRelations)
-			.agg(new RelationAggregator(maxRelations).toColumn())
+			.drop("source_w_pos")
-			.flatMap(
+			.withColumn("target_w_pos", functions.row_number().over(target_w))
-				(FlatMapFunction<Tuple2<String, RelationList>, Relation>) t -> Iterables
+			.where("target_w_pos < " + targetMaxRelations)
-					.limit(t._2().getRelations(), maxRelations)
+			.drop("target_w_pos")
-					.iterator(),
+			.coalesce(relPartitions)
 				Encoders.bean(Relation.class))
 			.repartition(relPartitions)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.parquet(outputPath);
 	}
 	public static class RelationAggregator
 		extends Aggregator<Relation, RelationList, RelationList> {
 		private final int maxRelations;
 		public RelationAggregator(int maxRelations) {
 			this.maxRelations = maxRelations;
 		}
 		@Override
 		public RelationList zero() {
 			return new RelationList();
 		}
 		@Override
 		public RelationList reduce(RelationList b, Relation a) {
 			b.getRelations().add(a);
 			return getSortableRelationList(b);
 		}
 		@Override
 		public RelationList merge(RelationList b1, RelationList b2) {
 			b1.getRelations().addAll(b2.getRelations());
 			return getSortableRelationList(b1);
 		}
 		@Override
 		public RelationList finish(RelationList r) {
 			return getSortableRelationList(r);
 		}
 		private RelationList getSortableRelationList(RelationList b1) {
 			RelationList sr = new RelationList();
 			sr
 				.setRelations(
 					b1
 						.getRelations()
 						.stream()
 						.limit(maxRelations)
 						.collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator()))));
 			return sr;
 		}
 		@Override
 		public Encoder<RelationList> bufferEncoder() {
 			return Encoders.kryo(RelationList.class);
 		}
 		@Override
 		public Encoder<RelationList> outputEncoder() {
 			return Encoders.kryo(RelationList.class);
 		}
 	}
 	/**
 	 * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
 	 * file,
 	 *
 	 * @param spark
 	 * @param inputPath
 	 * @return the JavaRDD<SortableRelation> containing all the relationships
 	 */
 	private static JavaRDD<Relation> readPathRelationRDD(
 		SparkSession spark, final String inputPath) {
 		JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class));
 	}
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
@ -14,4 +14,7 @@ public class ProvisionConstants {
 		return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
 	}
 	/** Alias name "public". NOTE(review): presumably the name of a Solr collection alias - confirm against callers. */
 	public static final String PUBLIC_ALIAS_NAME = "public";
 	/** Alias name "shadow". NOTE(review): presumably the name of a Solr collection alias - confirm against callers. */
 	public static final String SHADOW_ALIAS_NAME = "shadow";
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java
@ -1,44 +0,0 @@
 package eu.dnetlib.dhp.oa.provision;
 import java.util.Comparator;
 import java.util.Map;
 import java.util.Optional;
 import com.google.common.collect.ComparisonChain;
 import com.google.common.collect.Maps;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 /**
  * Orders {@link Relation}s by the weight associated with their subRelType: lower weights sort first, and a
  * subRelType without an associated weight is treated as {@link Integer#MAX_VALUE}.
  */
 public class RelationComparator implements Comparator<Relation> {
 	/** Weight of each known subRelType; the smaller the weight, the earlier the relation sorts. */
 	private static final Map<String, Integer> weights = Maps.newHashMap();
 	static {
 		// The position in this array is the weight assigned to the subRelType.
 		final String[] rankedSubRelTypes = {
 			ModelConstants.OUTCOME,
 			ModelConstants.SUPPLEMENT,
 			ModelConstants.REVIEW,
 			ModelConstants.CITATION,
 			ModelConstants.AFFILIATION,
 			ModelConstants.RELATIONSHIP,
 			ModelConstants.PUBLICATION_DATASET,
 			ModelConstants.SIMILARITY,
 			ModelConstants.PROVISION,
 			ModelConstants.PARTICIPATION,
 			ModelConstants.DEDUP
 		};
 		for (int rank = 0; rank < rankedSubRelTypes.length; rank++) {
 			weights.put(rankedSubRelTypes[rank], rank);
 		}
 	}
 	/**
 	 * Looks up the weight of a relation's subRelType.
 	 *
 	 * @param o the relation to weigh
 	 * @return the registered weight, or {@link Integer#MAX_VALUE} when none is registered
 	 */
 	private Integer getWeight(Relation o) {
 		final Integer registered = weights.get(o.getSubRelType());
 		if (registered == null) {
 			return Integer.MAX_VALUE;
 		}
 		return registered;
 	}
 	/**
 	 * Compares two relations by the weight of their subRelType.
 	 *
 	 * @param o1 the first relation
 	 * @param o2 the second relation
 	 * @return a negative, zero or positive value when the weight of {@code o1} is respectively lower than,
 	 *         equal to or greater than the weight of {@code o2}
 	 */
 	@Override
 	public int compare(Relation o1, Relation o2) {
 		return Integer.compare(getWeight(o1), getWeight(o2));
 	}
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Claudio Atzori	38f8ed27fd	[graph provision] log the Solr admin application operations for alias deletion and creation	2024-07-15 16:30:43 +02:00
Claudio Atzori	1fb44198fb	renamed workflow to better reflect its purpose	2024-07-15 15:24:38 +02:00
Claudio Atzori	6f6e85ddf4	code formatting	2024-07-15 09:32:04 +02:00
Claudio Atzori	7fa3d51200	renamed class, updated criteria to consider the ORCIDs used in the matchers	2024-07-15 09:18:58 +02:00
Michele Artini	f99fb21040	tests	2024-07-15 09:18:46 +02:00
Claudio Atzori	e17edb2581	[broker] fine tuned the workflow memory settings	2024-07-12 10:27:50 +02:00
Claudio Atzori	61d1fa9b9f	[metadata collection] added -Dcom.sun.security.enableAIAcaIssuers=true as a default for metadata collection	2024-07-12 10:26:45 +02:00
Claudio Atzori	f9ed2ae33c	[metadata collection] added the possibility to specify the JAVA_HOME and the JAVA_OPTS parameters	2024-07-11 15:32:36 +02:00
Miriam Baglioni	814e650e12	[Irish Tender]changed the irish.json file according to comments #26 , #29 , and #34 for 9635	2024-07-04 12:24:28 +02:00
Claudio Atzori	1180d78b71	make entity level pids unique by pidType:pidValue	2024-07-04 09:41:12 +02:00
Claudio Atzori	7d3292551b	ignore dates containing 'null's	2024-07-02 15:44:31 +02:00
Claudio Atzori	c7634c55c7	Merge pull request '[beta] implementation of countryMatch and addition of workflow parameters' (#451 ) from openorgs_fixes into beta Reviewed-on: D-Net/dnet-hadoop#451	2024-07-01 09:22:56 +02:00
Michele De Bonis	a10e8d9f05	implementation of countryMatch and addition of workflow parameters	2024-06-28 16:46:52 +02:00
Claudio Atzori	14539f9c8b	[graph provision] publicFormat worfklow parameter defined as optional	2024-06-28 14:55:18 +02:00
Claudio Atzori	1bc8c5d173	[graph provision] fixed serialization of the instancetypes	2024-06-28 14:54:28 +02:00
Claudio Atzori	1ccf01cdb8	Using the updated Solr JSON payload model classes	2024-06-28 12:38:07 +02:00
Claudio Atzori	b79cb155ba	Merge pull request 'Fix permissions-issue in Stats-workflow, step22a-createPDFsAggregated.' (#450 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#450	2024-06-26 10:11:34 +02:00
Claudio Atzori	33a02c5b9e	Merge pull request 'Change the selection criteria for the pivot record of a group so that by best pid type becomes the first criteria. This will have the effect to converge to records having DOI pid' (#446 ) from pivotselectionbypid into beta Reviewed-on: D-Net/dnet-hadoop#446	2024-06-26 10:10:13 +02:00
Claudio Atzori	1182bca9eb	Merge pull request 'Add support to cretate/update solr collection aliases' (#449 ) from 9872-create-solr-collection-aliases into beta Reviewed-on: D-Net/dnet-hadoop#449	2024-06-26 10:09:51 +02:00
Claudio Atzori	1c30eacac2	updated index feeding procedure to exploit the collection aliases	2024-06-25 15:27:38 +02:00
Claudio Atzori	6055212f77	merged from the json_payload branch	2024-06-25 12:39:02 +02:00
Claudio Atzori	0031cf849e	Merge branch 'beta' into 9872-create-solr-collection-aliases	2024-06-25 09:58:01 +02:00
Serafeim Chatzopoulos	9f6e16a03c	Add support to cretate/update solr collection aliases	2024-06-20 16:03:15 +03:00
Lampros Smyrnaios	66cd28f70a	- Fix not using the "export HADOOP_USER_NAME" statement in "createPDFsAggregated.sh", which caused permission-issues when creating tables with Impala. - Remove unused "--user" parameter in "impala-shell" calls. - Code polishing.	2024-06-20 14:33:46 +03:00
Lampros Smyrnaios	c6b1ab2a18	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-06-20 14:33:05 +03:00
Miriam Baglioni	d35edac212	[IrishFunderList]make changed according to 9635 comment 20, 21, 22 and 23	2024-06-20 12:28:28 +02:00
Miriam Baglioni	6421f8fece	Merge remote-tracking branch 'origin/beta' into beta	2024-06-19 11:12:15 +02:00
Miriam Baglioni	ac270f795b	[IrishFunderList]make changed according to 9635 comment 14, 15 and 16	2024-06-19 11:11:52 +02:00
Lampros Smyrnaios	236aed8954	Merge remote-tracking branch 'origin/beta' into beta	2024-06-18 17:12:35 +03:00
Claudio Atzori	dd541f8cf5	Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster.' (#447 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#447	2024-06-18 15:52:30 +02:00
Lampros Smyrnaios	ff335578ea	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-06-18 14:52:31 +03:00
Lampros Smyrnaios	285416c74e	Merge branch 'beta' into beta	2024-06-18 13:50:38 +02:00
Lampros Smyrnaios	3095047e5e	Miscellaneous updates to the copying operation to Impala Cluster: - Fix not breaking out of the VIEWS-infinite-loop when the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" is set to "false". - Exit the script when no HDFS-active-node was found, independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR". - Fix view_name-recognition in a log-message, by using the more advanced "Perl-Compatible Regular Expressions" in "grep". - Add error-handling for "compute stats" errors.	2024-06-18 14:40:41 +03:00
Antonis Lempesis	0456f1b788	Merge remote-tracking branch 'origin/beta' into beta	2024-06-14 15:11:30 +03:00
Antonis Lempesis	38636942c7	filtering out deletedbyinference and invinsible results from accessroute	2024-06-14 15:11:19 +03:00
Lampros Smyrnaios	d942a1101b	Miscellaneous updates to the copying operation to Impala Cluster: - Show some counts and the elapsed time for various sub-tasks. - Code polishing.	2024-06-14 12:14:38 +03:00
Giambattista Bloisi	9bf2bda1c6	Fix: next returned a null value at end of stream	2024-06-12 13:28:51 +02:00
Giambattista Bloisi	d90cb099b8	Fix for paginationStart parameter management	2024-06-11 20:23:44 +02:00
Giambattista Bloisi	4f2a61e10f	Change the selection criteria for the pivot record of a group so that by best pid type becomes the first criteria. This will have the effect to slowly converge to records having DOI pid	2024-06-11 15:33:56 +02:00
Claudio Atzori	11fe3a4fe0	[graph resolution] use sparkExecutorMemory to define also the memoryOverhead	2024-06-11 14:21:17 +02:00
Claudio Atzori	a8d68c9d29	avoid NPEs	2024-06-11 14:19:24 +02:00
Miriam Baglioni	8fe934810f	Merge remote-tracking branch 'origin/beta' into beta	2024-06-11 10:28:51 +02:00
Miriam Baglioni	9da006e98c	[SDGFoSActionSet]remove datainfo for the result. It is not needed (qualifier.classid = UPDATE) useless since subject do not go at the level of the instance	2024-06-11 10:28:32 +02:00
Giambattista Bloisi	85c1eae7e0	Fixes for pagination strategy looping at end of download	2024-06-10 19:03:58 +02:00
Claudio Atzori	b0eba210c0	[actionset promotion] use sparkExecutorMemory to define also the memoryOverhead	2024-06-10 16:15:24 +02:00
Claudio Atzori	3776327a8c	hostedby patching to work with the updated Crossref contents, resolved conflict	2024-06-10 15:24:12 +02:00
Claudio Atzori	0139f23d66	Merge pull request 'organization type from OpenOrgs' (#445 ) from import_openorg_type into beta Reviewed-on: D-Net/dnet-hadoop#445	2024-06-07 12:17:31 +02:00
Michele Artini	c726572418	changed some parameters in OSF test	2024-06-07 12:03:26 +02:00
Claudio Atzori	ec79405cc9	[graph raw] set organization type from openorgs	2024-06-07 11:30:31 +02:00
Miriam Baglioni	1477406ecc	[bulkTag] fixed issue that made project disappear in graph_10_enriched	2024-06-06 10:45:41 +02:00
Claudio Atzori	92c3abd5a4	[graph cleaning] use sparkExecutorMemory to define also the memoryOverhead	2024-06-06 10:44:33 +02:00
Claudio Atzori	ce2364743a	applying changes from PR#442: Fix for missing collectedfrom after dedup	2024-06-06 10:43:43 +02:00
Claudio Atzori	f70dc76b61	minor	2024-06-06 10:43:10 +02:00
Claudio Atzori	73bd1938a5	[graph2hive] use sparkExecutorMemory to define also the memoryOverhead	2024-06-05 12:17:35 +02:00
Claudio Atzori	da5c1e73a4	Merge pull request 'Irish oaipmh exporter' (#443 ) from irish-oaipmh-exporter into beta Reviewed-on: D-Net/dnet-hadoop#443	2024-06-05 10:55:09 +02:00
Claudio Atzori	a02f3f0d2b	code formatting	2024-05-30 10:21:18 +02:00
Alessia Bardi	eadfd8d71d	Merge pull request 'Updated XMLIterator for splitting on different nodes' (#436 ) from dblp_collection_plugin into beta Reviewed-on: D-Net/dnet-hadoop#436	2024-05-29 16:05:06 +02:00
Alessia Bardi	05ee783c07	Merge branch 'beta' into dblp_collection_plugin	2024-05-29 16:04:39 +02:00
Alessia Bardi	fe9fb59c90	Merge pull request 'Rest collector plugin on hadoop supports a new param to pass request headers' (#441 ) from rest-collector-request-header-map into beta Reviewed-on: D-Net/dnet-hadoop#441	2024-05-29 15:54:39 +02:00
Claudio Atzori	c272c4ad68	code formatting	2024-05-29 15:50:07 +02:00
Alessia Bardi	c5f4da16a4	Merge branch 'beta' into rest-collector-request-header-map	2024-05-29 15:46:23 +02:00
Alessia	1b165a14a0	Rest collector plugin on hadoop supports a new param to pass request headers	2024-05-29 15:41:36 +02:00
Michele Artini	e996787be2	OSF test	2024-05-29 15:05:17 +02:00
Claudio Atzori	62716141c5	Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster' (#440 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#440	2024-05-29 14:34:51 +02:00
Miriam Baglioni	5d85b70e1f	[NOAMI] removed Ireland funder id 501100011103. ticket 9635	2024-05-29 11:55:00 +02:00
Lampros Smyrnaios	e3f28338c1	Miscellaneous updates to the copying operation to Impala Cluster: - Assign the WRITE and EXECUTE permissions to the DBs' HDFS-directories, in order to be able to create tables on top of them, in the Impala Cluster. - Make sure the "copydb" function returns early, when it encounters a fatal error, while respecting the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config.	2024-05-28 17:51:45 +03:00
Giambattista Bloisi	73316d8c83	Add jaxb and jaxws dependencies when compiling with spark-34 profile as they are required to run with jdk > 8	2024-05-28 14:14:51 +02:00
Miriam Baglioni	75d5ddb999	Update to include a blackList that filters out the results we know are wrongly associated to IE - update workflow definition - the blacklist parameter	2024-05-27 12:01:28 +02:00
Miriam Baglioni	87c9c61b41	Update to include a blackList that filters out the results we know are wrongly associated to IE - refactoring	2024-05-27 12:01:16 +02:00
Miriam Baglioni	b55fed09f8	Update to include a blackList that filters out the results we know are wrongly associated to IE	2024-05-27 12:01:01 +02:00
Claudio Atzori	107d958b89	[org dedup] avoid NPEs in SparkPrepareNewOrgs	2024-05-27 11:59:54 +02:00
Claudio Atzori	3a7a6ecc32	[org dedup] avoid NPEs in SparkPrepareOrgRels	2024-05-27 11:59:45 +02:00
Claudio Atzori	1af4224d3d	[org dedup] avoid NPEs in SparkPrepareOrgRels	2024-05-27 11:59:33 +02:00
Claudio Atzori	0d5bdb2db0	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-05-27 11:59:02 +02:00
Claudio Atzori	66548e6a83	Merge pull request 'changes in copy script' (#438 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#438	2024-05-27 11:54:03 +02:00
Antonis Lempesis	15b54a345a	added fos lvl4	2024-05-24 13:21:28 +03:00
Lampros Smyrnaios	b48ed6e617	Change configuration in the copy-operation to Impala Cluster: Set the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" parameter to "false".	2024-05-23 16:58:12 +03:00
Lampros Smyrnaios	68322843e2	Small updates to the copy-operation to Impala Cluster: - Add a configuration-"switch" to control whether the script exits upon an error or not. - Allow the script to exit when a table could not be created. - Show the elapsed time for processing each database.	2024-05-23 15:07:49 +03:00
Lampros Smyrnaios	c7b32bbacc	Update CopyDataToImpalaCluster: Update the code of acquiring the entities from Ocean cluster, through hive, in order to optimize the process and account for additional reserved keywords in Impala. Co-authored-by: Antonis Lempesis <antleb@di.uoa.gr>	2024-05-23 13:00:19 +03:00
Giambattista Bloisi	1b2357e10a	Merge pull request 'Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12' (#327 ) from spark34-integration into beta Reviewed-on: D-Net/dnet-hadoop#327	2024-05-23 09:20:28 +02:00
Sandro La Bruzzo	f1fe363b19	merged again from beta (I hope for the last time)	2024-05-22 11:08:52 +02:00
Sandro La Bruzzo	66c1ffc866	merged again from beta (I hope for the last time)	2024-05-22 11:02:46 +02:00
Claudio Atzori	1ea67eba82	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-05-21 13:48:48 +02:00
Claudio Atzori	f9fb2fef6e	Merge pull request 'Modification of Microsoft Academic Graph Mapping' (#435 ) from mag_only_doi into beta Reviewed-on: D-Net/dnet-hadoop#435	2024-05-21 13:48:42 +02:00
Claudio Atzori	834461ba26	[graph provision]fixed wf definition, revised serialization of the usage counts measures	2024-05-21 13:48:06 +02:00
Sandro La Bruzzo	e8a61d5dd5	removed plugin, use only FileGZip plugin	2024-05-21 13:45:29 +02:00
Sandro La Bruzzo	ca9414b737	Implement multiple node name splitter on GZipCollectorPlugin and all nodes that use XMLIterator. If the splitter name contains is a comma separated values it splits for all the values	2024-05-21 09:11:13 +02:00
Sandro La Bruzzo	032bcc8279	since last beta workflow we decide to introduce in the graph only MAG item with DOI and set them invisible ( this should be the same behaviour of the previous DOIBoost mapping). This commit apply this type of mapping	2024-05-20 09:24:15 +02:00
Sandro La Bruzzo	103e2652b3	merged beta	2024-05-17 14:43:07 +02:00
Sandro La Bruzzo	a87f9ea643	fixed scholexplorer bug	2024-05-17 14:16:43 +02:00
Sandro La Bruzzo	6efab4d88e	fixed scholexplorer bug	2024-05-16 16:19:18 +02:00
Claudio Atzori	92f018d196	[graph provision] fixed path pointing to an intermediate data store in the working directory	2024-05-15 15:39:18 +02:00
Claudio Atzori	0611c81a2f	[graph provision] using Qualifier.classNames to populate the correponsing fields in the JSON payload	2024-05-15 15:33:10 +02:00
Claudio Atzori	1efe7f7e39	[graph provision] upgrade to dhp-schema:6.1.2, included project.oamandatepublications in the JSON payload mapping, fixed serialisation of the usageCounts measures	2024-05-14 12:39:31 +02:00
Claudio Atzori	53e7bb4336	Merge pull request 'rest-collector-plugin-with-retry' (#432 ) from rest-collector-plugin-with-retry into beta Reviewed-on: D-Net/dnet-hadoop#432	2024-05-10 09:02:33 +02:00
Claudio Atzori	f7d56e2ef2	Merge branch 'beta' into rest-collector-plugin-with-retry	2024-05-10 09:02:21 +02:00
Claudio Atzori	c1237ab39e	Merge pull request 'Fixes in Graph Provision' (#434 ) from beta_provision_relation into beta Reviewed-on: D-Net/dnet-hadoop#434	2024-05-09 14:15:05 +02:00
Claudio Atzori	dc3a5858f7	Merge branch 'beta' into beta_provision_relation	2024-05-09 14:14:43 +02:00
Claudio Atzori	55f39f7850	[graph provision] adds the possibility to validate the XML records before storing them via the validateXML parameter	2024-05-09 14:06:04 +02:00
Claudio Atzori	39a2afe8b5	[graph provision] fixed XML serialization of the usage counts measures, renamed workflow actions to better reflect their role	2024-05-09 13:54:42 +02:00
Claudio Atzori	908ed9da7a	Merge pull request 'Various fixes in the stats wf' (#430 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#430	2024-05-08 13:41:02 +02:00
Antonis Lempesis	0cada3cc8f	every step is run in the analytics queue. Hardcoded for now, will make a parameter later	2024-05-08 13:42:53 +03:00
Antonis Lempesis	90a4fb3547	fixed typos	2024-05-08 13:17:58 +03:00
Claudio Atzori	18aa323ee9	cleanup unused classes, adjustments in the oozie wf definition	2024-05-08 11:36:46 +02:00
Claudio Atzori	b4e3389432	fixed property mapping creating the RelatedEntity transient objects. spark cores & memory adjustments. Code formatting	2024-05-07 16:25:17 +02:00
Giambattista Bloisi	711048ceed	PrepareRelationsJob rewritten to use Spark Dataframe API and Windowing functions	2024-05-07 15:44:33 +02:00
Sandro La Bruzzo	db358ad0d2	code formatted	2024-05-02 15:25:57 +02:00
Sandro La Bruzzo	26bf8e763a	merged from beta	2024-05-02 15:20:23 +02:00
Sandro La Bruzzo	a860c57bbc	updated .gitignore	2024-05-02 15:16:00 +02:00
Sandro La Bruzzo	0646d0d064	Updated main sparkApplication to avoid to require master variable	2024-05-02 15:15:03 +02:00
Michele Artini	f4068de298	code reindent + tests	2024-05-02 09:51:33 +02:00
Michele Artini	2615136efc	added a retry mechanism	2024-04-30 11:58:42 +02:00
Sandro La Bruzzo	133ead1e3e	updated new version of scholexplorer Generation	2024-04-29 09:00:30 +02:00
Sandro La Bruzzo	052c6aac9d	formatted code	2024-04-26 16:03:04 +02:00
Sandro La Bruzzo	9cd3bc0f10	Added a new generation of the dump for scholexplorer tested with last version of spark, and strongly refactored	2024-04-26 16:02:07 +02:00
Sandro La Bruzzo	0d628cd62b	merged again from beta	2024-04-23 17:34:55 +02:00
Lampros Smyrnaios	49af2e5740	Miscellaneous updates to the copying operation to Impala Cluster: - Update the algorithm for creating views that depend on other views; overcome some bash-instabilities. - Upon any error, fail the whole process, not just the current DB-creation, as those errors usually indicate a bug in the initial DB-creation, that should be fixed immediately. - Enhance parallel-copy of large files by "hadoop distcp" command. - Reduce the "invalidate metadata" commands to just the current DB's tables, in order to eliminate the general overhead on Impala. - Show the number of tables and views in the logs. - Fix some log-messages.	2024-04-23 17:15:04 +03:00
Antonis Lempesis	d2649a1429	increased the jvm ram	2024-04-23 16:03:16 +03:00
Sandro La Bruzzo	073f320c6a	Added module containing all the dependencies, useful for spark deploy on k8.	2024-04-22 11:32:31 +02:00
Sandro La Bruzzo	b84ad0c06e	merged beta	2024-04-19 14:39:59 +02:00
Antonis Lempesis	b52a5a753b	Merge remote-tracking branch 'upstream/beta' into beta	2024-04-19 15:28:28 +03:00
Sandro La Bruzzo	8dd9cf84e2	code formatted	2024-04-19 12:30:59 +02:00
Sandro La Bruzzo	342cb6189b	fixed problem on changed signature on RowEncoder removed property dhp.schema.artifact	2024-04-19 12:13:26 +02:00
Antonis Lempesis	c3fe9662b2	all indicator tables are now stored as parquet	2024-04-19 12:45:36 +03:00
Antonis Lempesis	0c71c58df6	fixed the definition of gold_oa	2024-04-18 12:01:27 +03:00
Antonis Lempesis	43d05dbebb	fixed the definition of result_country	2024-04-18 11:53:50 +03:00
Antonis Lempesis	e728a0897c	fixed the definition of indi_pub_bronze_oa	2024-04-18 11:07:55 +03:00
Antonis Lempesis	308ae580a9	slight optimization in indi_pub_gold_oa definition	2024-04-18 10:57:52 +03:00
Antonis Lempesis	27d22bd8f9	slight optimization in indi_pub_gold_oa definition	2024-04-17 23:59:52 +03:00
Antonis Lempesis	1f5aba12fa	slight optimization in indi_pub_gold_oa definition	2024-04-17 23:54:23 +03:00
Giambattista Bloisi	613ec5ffce	Add profiles for different spark versions: spark-24, spark-34, spark-35	2023-12-05 19:11:06 +01:00
Sandro La Bruzzo	52495f2cd2	used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader	2023-12-05 19:11:06 +01:00
Sandro La Bruzzo	8c3e9a09d3	added repository openaire-third-parties	2023-12-05 19:11:06 +01:00
Giambattista Bloisi	2fa78f6071	Changes requires to build and run tests with Java 17	2023-12-05 19:11:06 +01:00
Giambattista Bloisi	326c9dc08c	Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12	2023-12-05 19:11:06 +01:00