[AffiliationIngestion]refactoring

[AffiliationIngestion]Extended the ingestion of affiliations from OpenAIRE to also include links derived from the Web Crawl. Extended the test. Added the id and name of the Web Crawl datasource to Constants, so they can be used here and also in the ingestion of links from the Web Crawl
2024-06-29 18:36:47 +02:00 · 2024-06-29 18:35:49 +02:00 · 2024-06-29 18:29:20 +02:00 · 2024-06-28 14:55:18 +02:00 · 2024-06-28 14:54:28 +02:00 · 2024-06-28 12:38:07 +02:00
146 changed files with 4586 additions and 2721 deletions
--- a/.gitignore
+++ b/.gitignore
@ -27,3 +27,4 @@ spark-warehouse
 /**/.factorypath
 /**/.scalafmt.conf
 /.java-version
+/dhp-shade-package/dependency-reduced-pom.xml
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
 		mojo.outputFile = testFolder;

 		// execute
-		Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+		try {
+			mojo.execute();
+			Assertions.assertTrue(false); // not reached
+		} catch (Exception e) {
+			Assertions
+				.assertTrue(
+					MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
+						IllegalArgumentException.class.isAssignableFrom(e.getClass()));
+		}
 	}

 	@Test
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -70,10 +70,7 @@
 			<groupId>com.ibm.icu</groupId>
 			<artifactId>icu4j</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>org.apache.hadoop</groupId>
-			<artifactId>hadoop-common</artifactId>
-		</dependency>
+
 		<dependency>
 			<groupId>com.github.sisyphsu</groupId>
 			<artifactId>dateparser</artifactId>
@ -163,7 +160,7 @@

 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>${dhp-schemas.artifact}</artifactId>
+			<artifactId>dhp-schemas</artifactId>
 		</dependency>

 		<dependency>
@ -172,4 +169,23 @@
 		</dependency>
 	</dependencies>

+	<!-- dependencies required on JDK9+ because J2EE has been removed -->
+	<profiles>
+		<profile>
+			<id>spark-34</id>
+			<dependencies>
+				<dependency>
+					<groupId>javax.xml.bind</groupId>
+					<artifactId>jaxb-api</artifactId>
+					<version>2.2.11</version>
+				</dependency>
+				<dependency>
+					<groupId>com.sun.xml.ws</groupId>
+					<artifactId>jaxws-ri</artifactId>
+					<version>2.3.3</version>
+					<type>pom</type>
+				</dependency>
+			</dependencies>
+		</profile>
+	</profiles>
 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
@ -38,7 +38,7 @@ public class PacePerson {
 					PacePerson.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/common/name_particles.txt")));
-		} catch (IOException e) {
+		} catch (Exception e) {
 			throw new RuntimeException(e);
 		}
 	}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
 	 *            part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
 	 *            concept_rec_id = 656930
 	 * @return response code
-	 * @throws IOException
-	 * @throws MissingConceptDoiException
 	 */
 	public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
 		setDepositionId(concept_rec_id, 1);
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.lang3.time.DateUtils;
 import org.apache.http.HttpHeaders;
-import org.joda.time.Instant;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java
@ -0,0 +1,106 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.util.*;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+/**
+ * Orders {@link Oaf} records by merge preference. The criteria are applied in this order:
+ * <ol>
+ * <li>null records compare lowest;</li>
+ * <li>PID authority: the highest position in {@link #PID_AUTHORITIES} among the record's collectedfrom keys
+ * (later entries in the list compare greater, records from no listed authority compare lowest);</li>
+ * <li>trust, compared with its natural ordering when both records provide one;</li>
+ * <li>for {@link Result}s, the position of the resulttype classid in {@link #RESULT_TYPES}
+ * (later entries compare greater, a missing resulttype compares lowest);</li>
+ * <li>for {@link OafEntity}s, the identifier (a missing identifier compares lowest).</li>
+ * </ol>
+ * A non-zero trust comparison is returned as is: resulttype and identifier are only used to break ties.
+ */
+public class MergeEntitiesComparator implements Comparator<Oaf> {
+	static final List<String> PID_AUTHORITIES = Arrays
+		.asList(
+			ModelConstants.ARXIV_ID,
+			ModelConstants.PUBMED_CENTRAL_ID,
+			ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
+			ModelConstants.DATACITE_ID,
+			ModelConstants.CROSSREF_ID);
+
+	static final List<String> RESULT_TYPES = Arrays
+		.asList(
+			ModelConstants.ORP_RESULTTYPE_CLASSID,
+			ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
+			ModelConstants.DATASET_RESULTTYPE_CLASSID,
+			ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
+
+	/** Rank of a result without resulttype/classid: below the -1 that indexOf yields for an unlisted classid. */
+	private static final int MISSING_RESULT_TYPE = -2;
+
+	public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
+
+	/**
+	 * Compares two records by merge preference, see the class description for the criteria.
+	 *
+	 * @param left the first record, may be null
+	 * @param right the second record, may be null
+	 * @return a negative, zero or positive value when left is less than, equal to or greater than right
+	 */
+	@Override
+	public int compare(Oaf left, Oaf right) {
+		if (left == null && right == null) {
+			return 0;
+		}
+		if (left == null) {
+			return -1;
+		}
+		if (right == null) {
+			return 1;
+		}
+
+		// pid authority
+		int byAuthority = Integer.compare(pidAuthorityRank(left), pidAuthorityRank(right));
+		if (byAuthority != 0) {
+			return byAuthority;
+		}
+
+		int res = 0;
+
+		// trust: skipped (treated as a tie) unless both sides actually carry a value
+		if (left.getDataInfo() != null && right.getDataInfo() != null
+			&& left.getDataInfo().getTrust() != null && right.getDataInfo().getTrust() != null) {
+			res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
+		}
+
+		// result type
+		if (res == 0 && left instanceof Result && right instanceof Result) {
+			int byType = Integer.compare(resultTypeRank((Result) left), resultTypeRank((Result) right));
+			if (byType != 0) {
+				return byType;
+			}
+		}
+
+		// id
+		if (res == 0 && left instanceof OafEntity && right instanceof OafEntity) {
+			OafEntity l = (OafEntity) left;
+			OafEntity r = (OafEntity) right;
+			if (l.getId() == null || r.getId() == null) {
+				// a missing id sorts before any present id; two missing ids are a tie
+				res = Boolean.compare(l.getId() != null, r.getId() != null);
+			} else {
+				res = l.getId().compareTo(r.getId());
+			}
+		}
+
+		return res;
+	}
+
+	/**
+	 * Highest index in PID_AUTHORITIES among the collectedfrom keys of the given record,
+	 * or -1 when collectedfrom is null, empty or contains no listed authority.
+	 */
+	private static int pidAuthorityRank(Oaf oaf) {
+		return Optional
+			.ofNullable(oaf.getCollectedfrom())
+			.map(
+				cf -> cf
+					.stream()
+					.filter(Objects::nonNull)
+					.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
+					.max(Integer::compare)
+					.orElse(-1))
+			.orElse(-1);
+	}
+
+	/**
+	 * Index of the resulttype classid in RESULT_TYPES, -1 when the classid is not listed,
+	 * MISSING_RESULT_TYPE when the result has no resulttype or no classid.
+	 */
+	private static int resultTypeRank(Result result) {
+		if (result.getResulttype() == null || result.getResulttype().getClassid() == null) {
+			return MISSING_RESULT_TYPE;
+		}
+		return RESULT_TYPES.indexOf(result.getResulttype().getClassid());
+	}
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@ -40,27 +40,12 @@ public class MergeUtils {

 	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
 		boolean checkDelegateAuthority) {
-		TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
-			int res = 0;

-			if (o1.getDataInfo() != null && o2.getDataInfo() != null) {
-				res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust());
-			}
+		ArrayList<T> sortedEntities = new ArrayList<>();
+		oafEntityIterator.forEachRemaining(sortedEntities::add);
+		sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());

-			if (res == 0) {
-				if (o1 instanceof Result && o2 instanceof Result) {
-					return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2);
-				}
-			}
-
-			return res;
-		});
-
-		while (oafEntityIterator.hasNext()) {
-			sortedEntities.add(oafEntityIterator.next());
-		}
-
-		Iterator<T> it = sortedEntities.descendingIterator();
+		Iterator<T> it = sortedEntities.iterator();
 		T merged = it.next();

 		while (it.hasNext()) {
@ -143,7 +128,7 @@ public class MergeUtils {
 	 * https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
 	 * such version.
 	 * <p>
-	 * Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
+	 * Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator}
 	 * and proceeds with the canonical property merging.
 	 *
 	 * @param left
@ -161,8 +146,9 @@ public class MergeUtils {
 		if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
 			return right;
 		}
+
 		// TODO: raise trust to have preferred fields from one or the other??
-		if (new ResultTypeComparator().compare(left, right) < 0) {
+		if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) {
 			return mergeResultFields(left, right);
 		} else {
 			return mergeResultFields(right, left);
@ -225,9 +211,9 @@ public class MergeUtils {

 	private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
 		Function<T, K> keyExtractor, BinaryOperator<T> merger) {
-		if (left == null) {
-			return right;
-		} else if (right == null) {
+		if (left == null || left.isEmpty()) {
+			return right != null ? right : new ArrayList<>();
+		} else if (right == null || right.isEmpty()) {
 			return left;
 		}

@ -405,7 +391,7 @@ public class MergeUtils {
 		}

 		// should be an instance attribute, get the first non-null value
-		merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage()));
+		merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage()));

 		// distinct countries, do not manage datainfo
 		merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
@ -575,6 +561,13 @@ public class MergeUtils {
 		return m != null ? m : e;
 	}

+	private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
+		if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) {
+			return e;
+		}
+		return m;
+	}
+
 	private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
 		List<List<Author>> authors = new ArrayList<>();
 		if (author != null) {
@ -587,6 +580,10 @@ public class MergeUtils {
 	}

 	private static String instanceKeyExtractor(Instance i) {
+		// three levels of concatenating:
+		// 1. ::
+		// 2. @@
+		// 3. ||
 		return String
 			.join(
 				"::",
@ -594,10 +591,10 @@ public class MergeUtils {
 				kvKeyExtractor(i.getCollectedfrom()),
 				qualifierKeyExtractor(i.getAccessright()),
 				qualifierKeyExtractor(i.getInstancetype()),
-				Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
+				Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
 				Optional
 					.ofNullable(i.getPid())
-					.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
+					.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
 					.orElse(null));
 	}

@ -706,7 +703,7 @@ public class MergeUtils {
 	private static String spKeyExtractor(StructuredProperty sp) {
 		return Optional
 			.ofNullable(sp)
-			.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
+			.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
 			.orElse(null);
 	}

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java
@ -1,87 +0,0 @@
-
-package eu.dnetlib.dhp.schema.oaf.utils;
-
-import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
-
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.Optional;
-import java.util.stream.Collectors;
-
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class ResultTypeComparator implements Comparator<Result> {
-
-	public static final ResultTypeComparator INSTANCE = new ResultTypeComparator();
-
-	@Override
-	public int compare(Result left, Result right) {
-
-		if (left == null && right == null)
-			return 0;
-		if (left == null)
-			return 1;
-		if (right == null)
-			return -1;
-
-		HashSet<String> lCf = getCollectedFromIds(left);
-		HashSet<String> rCf = getCollectedFromIds(right);
-
-		if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
-			return -1;
-		}
-		if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
-			return 1;
-		}
-
-		if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
-			if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
-				return 0;
-			}
-			return 1;
-		} else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
-			return -1;
-		}
-
-		String lClass = left.getResulttype().getClassid();
-		String rClass = right.getResulttype().getClassid();
-
-		if (!lClass.equals(rClass)) {
-			if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
-				return 1;
-
-			if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
-				return 1;
-
-			if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
-				return 1;
-
-			if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
-				return -1;
-			if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
-				return 1;
-		}
-
-		// Else (but unlikely), lexicographical ordering will do.
-		return lClass.compareTo(rClass);
-	}
-
-	protected HashSet<String> getCollectedFromIds(Result left) {
-		return Optional
-			.ofNullable(left.getCollectedfrom())
-			.map(
-				cf -> cf
-					.stream()
-					.map(KeyValue::getKey)
-					.collect(Collectors.toCollection(HashSet::new)))
-			.orElse(new HashSet<>());
-	}
-}
--- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
+++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
@ -154,5 +154,13 @@
  "unknown":{
    "original":"Unknown",
    "inverse":"Unknown"
+  },
+  "isamongtopnsimilardocuments": {
+    "original": "IsAmongTopNSimilarDocuments",
+    "inverse": "HasAmongTopNSimilarDocuments"
+  },
+  "hasamongtopnsimilardocuments": {
+    "original": "HasAmongTopNSimilarDocuments",
+    "inverse": "IsAmongTopNSimilarDocuments"
  }
 }
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
    val conf: SparkConf = new SparkConf()
    val master = parser.get("master")
    log.info(s"Creating Spark session: Master: $master")
-    SparkSession
+    val b = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
-      .master(master)
-      .getOrCreate()
+    if (master != null)
+      b.master(master)
+    b.getOrCreate()
  }

  def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
  }

  def generateScholixResourceFromResult(r: Result): ScholixResource = {
-    generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+    // Summarise the result once and reuse it: resultToSummary may return null
+    // (in which case there is no resource to build), and calling it a second
+    // time would repeat the whole conversion for nothing.
+    val sum = ScholixUtils.resultToSummary(r)
+    if (sum != null)
+      generateScholixResourceFromSummary(sum)
+    else
+      null
  }

  val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {

  }

+  /** Looks up the inverse name of a semantic relation.
+    *
+    * @param rel the relation name; it is lower-cased before the lookup in `relations`
+    * @return the `inverse` of the matching entry, or null when no entry is found
+    */
+  def invRel(rel: String): String = {
+    Option(relations.getOrElse(rel.toLowerCase, null)) match {
+      case Some(semanticRelation) => semanticRelation.inverse
+      case None                   => null
+    }
+  }
+
  def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
    if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
      val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
    if (persistentIdentifiers.isEmpty)
      return null
    s.setLocalIdentifier(persistentIdentifiers.asJava)
-    if (r.isInstanceOf[Publication])
-      s.setTypology(Typology.publication)
-    else
-      s.setTypology(Typology.dataset)
+//    s.setTypology(r.getResulttype.getClassid)

    s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

--- a/dhp-pace-core/pom.xml
+++ b/dhp-pace-core/pom.xml
@ -24,7 +24,7 @@
 				<executions>
 					<execution>
 						<id>scala-compile-first</id>
-						<phase>initialize</phase>
+						<phase>process-resources</phase>
 						<goals>
 							<goal>add-source</goal>
 							<goal>compile</goal>
@ -59,14 +59,6 @@
 			<groupId>edu.cmu</groupId>
 			<artifactId>secondstring</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>com.google.guava</groupId>
-			<artifactId>guava</artifactId>
-		</dependency>
-		<dependency>
-			<groupId>com.google.code.gson</groupId>
-			<artifactId>gson</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-lang3</artifactId>
@ -91,10 +83,6 @@
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-databind</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>org.apache.commons</groupId>
-			<artifactId>commons-math3</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>com.jayway.jsonpath</groupId>
 			<artifactId>json-path</artifactId>
@ -113,4 +101,90 @@
 		</dependency>
 	</dependencies>

+	<profiles>
+		<profile>
+			<id>spark-24</id>
+			<activation>
+				<activeByDefault>true</activeByDefault>
+			</activation>
+
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-2</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<id>spark-34</id>
+
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-2</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<id>spark-35</id>
+
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-35</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+	</profiles>
+
 </project>
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@ -1,12 +1,6 @@

 package eu.dnetlib.pace.common;

-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
@ -15,6 +9,13 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;

+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
 /**
 * Set of common functions for the framework
 *
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
 import com.jayway.jsonpath.{Configuration, JsonPath}
 import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
  val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

  val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(RowEncoder(schema))
+    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
  }

  def rowFromJson(json: String): Row = {
--- a/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
+++ b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.types.StructType
+
+/** Spark-version specific helpers; this variant lives under the `spark-2` source folder. */
+object SparkCompatUtils {
+
+  /** Builds the Row encoder for the given schema by delegating to `RowEncoder(schema)`.
+    *
+    * @param schema the structure of the rows to encode
+    * @return the encoder produced by `RowEncoder`
+    */
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = RowEncoder(schema)
+}
--- a/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
+++ b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.types.StructType
+
+/** Spark-version specific helpers; this variant lives under the `spark-35` source folder. */
+object SparkCompatUtils {
+
+  /** Builds the Row encoder for the given schema by delegating to `ExpressionEncoder(schema)`.
+    *
+    * @param schema the structure of the rows to encode
+    * @return the encoder produced by `ExpressionEncoder`
+    */
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = ExpressionEncoder(schema)
+}
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import eu.dnetlib.pace.model.Person;
+import jdk.nashorn.internal.ir.annotations.Ignore;

 public class UtilTest {

--- a/dhp-shade-package/dependency-reduced-pom.xml
+++ b/dhp-shade-package/dependency-reduced-pom.xml
@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <parent>
+    <artifactId>dhp</artifactId>
+    <groupId>eu.dnetlib.dhp</groupId>
+    <version>1.2.5-SNAPSHOT</version>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+  <artifactId>dhp-shade-package</artifactId>
+  <description>This module create a jar of all module dependencies</description>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <transformers>
+                <transformer>
+                  <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
+                </transformer>
+                <transformer />
+                <transformer>
+                  <resource>META-INF/cxf/bus-extensions.txt</resource>
+                </transformer>
+              </transformers>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/maven/**</exclude>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <relocations>
+                <relocation>
+                  <pattern>com</pattern>
+                  <shadedPattern>repackaged.com.google.common</shadedPattern>
+                  <includes>
+                    <include>com.google.common.**</include>
+                  </includes>
+                </relocation>
+              </relocations>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.projectlombok</groupId>
+      <artifactId>lombok</artifactId>
+      <version>1.18.28</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter</artifactId>
+      <version>5.6.1</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>junit-jupiter-api</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>junit-jupiter-params</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>junit-jupiter-engine</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>3.3.3</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>byte-buddy</artifactId>
+          <groupId>net.bytebuddy</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>byte-buddy-agent</artifactId>
+          <groupId>net.bytebuddy</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-junit-jupiter</artifactId>
+      <version>3.3.3</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <distributionManagement>
+    <site>
+      <id>DHPSite</id>
+      <url>${dhp.site.stage.path}/dhp-common</url>
+    </site>
+  </distributionManagement>
+</project>
--- a/dhp-shade-package/pom.xml
+++ b/dhp-shade-package/pom.xml
@ -0,0 +1,169 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <artifactId>dhp</artifactId>
+        <version>1.2.5-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+
+    </parent>
+
+    <artifactId>dhp-shade-package</artifactId>
+    <packaging>jar</packaging>
+
+    <distributionManagement>
+        <site>
+            <id>DHPSite</id>
+            <url>${dhp.site.stage.path}/dhp-common</url>
+        </site>
+    </distributionManagement>
+
+    <description>This module create a jar of all module dependencies</description>
+
+
+    <dependencies>
+
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-actionmanager</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-aggregation</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-blacklist</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-broker-events</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-dedup-openaire</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-enrichment</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-graph-mapper</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-graph-provision</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-impact-indicators</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-actionsets</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-hist-snaps</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-monitor-irish</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-promote</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-stats-update</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-swh</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-usage-raw-data-update</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-usage-stats-build</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+    </dependencies>
+
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <transformers>
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
+                                </transformer>
+                                <!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                                    <resource>META-INF/cxf/bus-extensions.txt</resource>
+                                </transformer>
+                            </transformers>
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/maven/**</exclude>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <relocations>
+                                <relocation>
+                                    <pattern>com</pattern>
+                                    <shadedPattern>repackaged.com.google.common</shadedPattern>
+                                    <includes>
+                                        <include>com.google.common.**</include>
+                                    </includes>
+                                </relocation>
+                            </relocations>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -156,6 +157,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
@ -125,6 +125,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -155,11 +156,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
@ -95,6 +95,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
@ -103,11 +103,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -156,11 +157,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
@ -95,11 +95,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=10000
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
@ -103,6 +103,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -155,11 +156,12 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=4000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java
@ -42,6 +42,9 @@ public class Constants {
 	public static final String NULL = "NULL";
 	public static final String NA = "N/A";

+	public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
+	public static final String WEB_CRAWL_NAME = "Web Crawl";
+
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

 	private Constants() {
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@ -9,6 +9,7 @@ import java.util.List;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@ -40,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static final String ID_PREFIX = "50|doi_________::";
-	public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference";
-	public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!";
-	public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref";
+	public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
+	public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
+	public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";

 	public static <I extends Result> void main(String[] args) throws Exception {

@ -70,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable {
 		final String dataciteInputPath = parser.get("dataciteInputPath");
 		log.info("dataciteInputPath: {}", dataciteInputPath);

+		final String webcrawlInputPath = parser.get("webCrawlInputPath");
+		log.info("webcrawlInputPath: {}", webcrawlInputPath);
+
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);

@ -101,12 +105,18 @@ public class PrepareAffiliationRelations implements Serializable {
 				JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
 					spark, dataciteInputPath, collectedFromDatacite);

+				List<KeyValue> collectedFromWebCrawl = OafMapperUtils
+					.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
+				JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
+					spark, webcrawlInputPath, collectedFromWebCrawl);
+
 				crossrefRelations
 					.union(pubmedRelations)
 					.union(openAPCRelations)
 					.union(dataciteRelations)
+					.union(webCrawlRelations)
 					.saveAsHadoopFile(
-						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);

 			});
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -10,6 +10,7 @@ import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
 				resultsRDD
 					.union(projectsRDD)
 					.saveAsHadoopFile(
-						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 			});
 	}

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
@ -115,19 +115,7 @@ public class PrepareFOSSparkJob implements Serializable {
 			.forEach(
 				l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
 		r.setSubject(sbjs);
-		r
-			.setDataInfo(
-				OafMapperUtils
-					.dataInfo(
-						false, null, true,
-						false,
-						OafMapperUtils
-							.qualifier(
-								ModelConstants.PROVENANCE_ENRICH,
-								null,
-								ModelConstants.DNET_PROVENANCE_ACTIONS,
-								ModelConstants.DNET_PROVENANCE_ACTIONS),
-						null));
+
 		return r;
 	}

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java
@ -81,19 +81,7 @@ public class PrepareSDGSparkJob implements Serializable {
 						s -> sbjs
 							.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
 				r.setSubject(sbjs);
-				r
-					.setDataInfo(
-						OafMapperUtils
-							.dataInfo(
-								false, null, true,
-								false,
-								OafMapperUtils
-									.qualifier(
-										ModelConstants.PROVENANCE_ENRICH,
-										null,
-										ModelConstants.DNET_PROVENANCE_ACTIONS,
-										ModelConstants.DNET_PROVENANCE_ACTIONS),
-								null));
+
 				return r;
 			}, Encoders.bean(Result.class))
 			.write()
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.types.StructType;
@ -20,6 +21,7 @@ import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.ObjectMapper;

+import eu.dnetlib.dhp.actionmanager.Constants;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -28,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
 import eu.dnetlib.dhp.schema.oaf.utils.PidType;
+import io.netty.util.Constant;
 import scala.Tuple2;

 /**
@ -43,8 +46,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
 	private static final String PMID_PREFIX = "50|pmid________::";

 	private static final String PMCID_PREFIX = "50|pmc_________::";
-	private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
-	private static final String WEB_CRAWL_NAME = "Web Crawl";
+
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

 	public static void main(String[] args) throws Exception {
@ -70,6 +72,9 @@ public class CreateActionSetFromWebEntries implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);

+		final String blackListInputPath = parser.get("blackListPath");
+		log.info("blackListInputPath: {}", blackListInputPath);
+
 		SparkConf conf = new SparkConf();

 		runWithSparkSession(
@ -77,29 +82,35 @@ public class CreateActionSetFromWebEntries implements Serializable {
 			isSparkSessionManaged,
 			spark -> {

-				createActionSet(spark, inputPath, outputPath);
+				createActionSet(spark, inputPath, outputPath, blackListInputPath);

 			});
 	}

 	public static void createActionSet(SparkSession spark, String inputPath,
-		String outputPath) {
+		String outputPath, String blackListInputPath) {

 		final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
-			.filter("publication_year <= 2020 or country_code=='IE'")
+			.filter("country_code=='IE'")
 			.drop("publication_year");

-		dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
-			List<Relation> ret = new ArrayList<>();
-			final String ror = ROR_PREFIX
-				+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
-			ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
-			ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
-			ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
+		final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);

-			return ret
-				.iterator();
-		}, Encoders.bean(Relation.class))
+		dataset
+			.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
+			.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
+			.drop("OpenAlexId")
+			.flatMap((FlatMapFunction<Row, Relation>) row -> {
+				List<Relation> ret = new ArrayList<>();
+				final String ror = ROR_PREFIX
+					+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
+				ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
+				ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
+				ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
+
+				return ret
+					.iterator();
+			}, Encoders.bean(Relation.class))
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p))
 			.mapToPair(
@ -136,6 +147,15 @@ public class CreateActionSetFromWebEntries implements Serializable {

 	}

+	private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) { // loads the blacklist CSV found at inputPath
+
+		return spark
+			.read()
+			.option("header", true) // the first CSV row supplies the column names
+			.csv(inputPath)
+			.select("OpenAlexId"); // only the OpenAlexId column is kept; createActionSet joins on it
+	}
+
 	private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
 		if (pmcid == null)
 			return new ArrayList<>();
@ -195,7 +215,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
 						ModelConstants.IS_AUTHOR_INSTITUTION_OF,
 						Arrays
 							.asList(
-								OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
+								OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
 						OafMapperUtils
 							.dataInfo(
 								false, null, false, false,
@ -214,7 +234,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
 						ModelConstants.HAS_AUTHOR_INSTITUTION,
 						Arrays
 							.asList(
-								OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
+								OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
 						OafMapperUtils
 							.dataInfo(
 								false, null, false, false,
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
@ -1,6 +1,7 @@

 package eu.dnetlib.dhp.collection.plugin.rest;

+import java.util.Map;
 import java.util.Optional;
 import java.util.Spliterator;
 import java.util.Spliterators;
@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;

 import org.apache.commons.lang3.StringUtils;

+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
 		final String entityXpath = api.getParams().get("entityXpath");
 		final String authMethod = api.getParams().get("authMethod");
 		final String authToken = api.getParams().get("authToken");
+		final String requestHeaderMap = api.getParams().get("requestHeaderMap");
+		Gson gson = new Gson();
+		Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
 		final String resultSizeValue = Optional
 			.ofNullable(api.getParams().get("resultSizeValue"))
 			.filter(StringUtils::isNotBlank)
@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
 		if (StringUtils.isBlank(resultFormatValue)) {
 			throw new CollectorException("Param 'resultFormatValue' is null or empty");
 		}
-		if (StringUtils.isBlank(queryParams)) {
-			throw new CollectorException("Param 'queryParams' is null or empty");
-		}
 		if (StringUtils.isBlank(entityXpath)) {
 			throw new CollectorException("Param 'entityXpath' is null or empty");
 		}
@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
 			entityXpath,
 			authMethod,
 			authToken,
-			resultOutputFormat);
+			resultOutputFormat,
+			requestHeaders);

 		return StreamSupport
 			.stream(
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
@ -9,8 +9,11 @@ import java.net.URL;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
@ -22,20 +25,20 @@ import javax.xml.xpath.*;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.http.HttpHeaders;
-import org.apache.http.entity.ContentType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;

+import com.google.common.collect.Maps;
+
 import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;

 /**
- * log.info(...) equal to  log.trace(...) in the application-logs
+ * log.info(...) equal to log.trace(...) in the application-logs
 * <p>
 * known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
 *
@ -44,24 +47,29 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
 *
 */
 public class RestIterator implements Iterator<String> {
-
 	private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
 	public static final String UTF_8 = "UTF-8";
+	private static final int MAX_ATTEMPTS = 5;

 	private final HttpClientParams clientParams;

-	private final String BASIC = "basic";
+	private final String AUTHBASIC = "basic";
+
+	private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+	private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
+		+ ">";

 	private final String baseUrl;
 	private final String resumptionType;
 	private final String resumptionParam;
 	private final String resultFormatValue;
-	private String queryParams;
+	private String queryParams = "";
 	private final int resultSizeValue;
 	private int resumptionInt = 0; // integer resumption token (first record to harvest)
 	private int resultTotal = -1;
-	private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
-																	// or token scanned from results)
+	private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
+																			// harvest
+	// or token scanned from results)
 	private InputStream resultStream;
 	private Transformer transformer;
 	private XPath xpath;
@ -73,7 +81,7 @@ public class RestIterator implements Iterator<String> {
 	private final String querySize;
 	private final String authMethod;
 	private final String authToken;
-	private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
+	private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
 	private int discoverResultSize = 0;
 	private int pagination = 1;
 	/*
@ -83,8 +91,13 @@ public class RestIterator implements Iterator<String> {
 	 */
 	private final String resultOutputFormat;

-	/** RestIterator class
-	 *  compatible to version 1.3.33
+	/*
+	 * Can be used to set additional request headers, like for content negotiation
+	 */
+	private Map<String, String> requestHeaders;
+
+	/**
+	 * RestIterator class compatible to version 1.3.33
 	 */
 	public RestIterator(
 		final HttpClientParams clientParams,
@ -101,47 +114,56 @@ public class RestIterator implements Iterator<String> {
 		final String entityXpath,
 		final String authMethod,
 		final String authToken,
-		final String resultOutputFormat) {
+		final String resultOutputFormat,
+		final Map<String, String> requestHeaders) {

 		this.clientParams = clientParams;
 		this.baseUrl = baseUrl;
 		this.resumptionType = resumptionType;
 		this.resumptionParam = resumptionParam;
 		this.resultFormatValue = resultFormatValue;
-		this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
+		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
 		this.queryParams = queryParams;
 		this.authMethod = authMethod;
 		this.authToken = authToken;
 		this.resultOutputFormat = resultOutputFormat;
+		this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();

-		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+		this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+			: "";
+		this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
 			: "";
-		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

 		try {
 			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
-		} catch (Exception e) {
+		} catch (final Exception e) {
 			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
 		}

 		initQueue();
 	}

-	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
+	private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
+		final String entityXpath)
 		throws TransformerConfigurationException, XPathExpressionException {
 		final TransformerFactory factory = TransformerFactory.newInstance();
-		transformer = factory.newTransformer();
-		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
-		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
-		xpath = XPathFactory.newInstance().newXPath();
-		xprResultTotalPath = xpath.compile(resultTotalXpath);
-		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
-		xprEntity = xpath.compile(entityXpath);
+		this.transformer = factory.newTransformer(); // no stylesheet is supplied to newTransformer()
+		this.transformer.setOutputProperty(OutputKeys.INDENT, "yes"); // serialized XML is indented
+		this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); // indent amount of 3
+		this.xpath = XPathFactory.newInstance().newXPath(); // one XPath instance compiles all three expressions below
+		this.xprResultTotalPath = this.xpath.compile(resultTotalXpath); // compiled once and kept in a field
+		this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); // a blank resumptionXpath falls back to "/"
+		this.xprEntity = this.xpath.compile(entityXpath); // compiled once and kept in a field
 	}

 	private void initQueue() {
-		query = baseUrl + "?" + queryParams + querySize + queryFormat;
-		log.info("REST calls starting with {}", query);
+		if (StringUtils.isEmpty(queryParams) && querySize.equals("") && queryFormat.equals("")) { // null-safe: queryParams is an unchecked constructor argument and may be null
+			query = baseUrl; // nothing to append, so the first request is the bare base URL without "?"
+		} else {
+			query = baseUrl + "?" + StringUtils.defaultString(queryParams) + querySize + queryFormat; // a null queryParams contributes "" instead of the text "null"
+		}
+
+		log.info("REST calls starting with {}", this.query);
 	}

 	private void disconnect() {
@ -154,11 +176,22 @@ public class RestIterator implements Iterator<String> {
 	 */
 	@Override
 	public boolean hasNext() {
-		if (recordQueue.isEmpty() && query.isEmpty()) {
+		synchronized (this.recordQueue) { // queue refill and emptiness check happen under one monitor
+			while (this.recordQueue.isEmpty() && !this.query.isEmpty()) { // fetch pages until a record is queued or no follow-up query remains
+				try {
+					this.query = downloadPage(this.query, 0); // attempt counter starts at 0; the return value is the next query
+				} catch (final CollectorException e) {
+					log.debug("RestIterator.hasNext() failed to download [{}]", this.query, e); // throwable passed last so the logger records its stack trace
+					throw new RuntimeException(e); // Iterator.hasNext() cannot declare a checked exception; the cause is preserved
+				}
+			}
+
+			if (!this.recordQueue.isEmpty()) { // at least one record is ready for next()
+				return true;
+			}
+
 			disconnect();
 			return false;
-		} else {
-			return true;
 		}
 	}

@ -168,214 +201,248 @@ public class RestIterator implements Iterator<String> {
 	 */
 	@Override
 	public String next() {
-		synchronized (recordQueue) {
-			while (recordQueue.isEmpty() && !query.isEmpty()) {
-				try {
-					query = downloadPage(query);
-				} catch (CollectorException e) {
-					log.debug("CollectorPlugin.next()-Exception: {}", e);
-					throw new RuntimeException(e);
-				}
-			}
-			return recordQueue.poll();
+		synchronized (this.recordQueue) {
+			return this.recordQueue.poll();
 		}
 	}

 	/*
-	 * download page and return nextQuery
+	 * download page and return nextQuery (with number of attempt)
 	 */
-	private String downloadPage(String query) throws CollectorException {
-		String resultJson;
-		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
-		String nextQuery = "";
-		String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
-		Node resultNode = null;
-		NodeList nodeList = null;
-		String qUrlArgument = "";
-		int urlOldResumptionSize = 0;
-		InputStream theHttpInputStream;
+	private String downloadPage(String query, final int attempt) throws CollectorException {

-		// check if cursor=* is initial set otherwise add it to the queryParam URL
-		if (resumptionType.equalsIgnoreCase("deep-cursor")) {
-			log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
-			if (!query.contains("&cursor=")) {
-				query += "&cursor=*";
+		if (attempt > MAX_ATTEMPTS) {
+			throw new CollectorException("Max Number of attempts reached, query:" + query);
+		}
+
+		if (attempt > 0) {
+			// linear back-off: every further attempt waits 5 seconds longer than the previous one
+			final int delay = (attempt * 5000);
+			log.debug("Attempt {} with delay {}", attempt, delay);
+			try {
+				Thread.sleep(delay);
+			} catch (final InterruptedException e) {
+				// restore the interrupt flag and abort the download instead of silently carrying on
+				Thread.currentThread().interrupt();
+				throw new CollectorException(e);
 			}
 		}

 		try {
-			log.info("requestig URL [{}]", query);
+			String resultJson;
+			String resultXml = XML_HEADER;
+			String nextQuery = "";
+			Node resultNode = null;
+			NodeList nodeList = null;
+			String qUrlArgument = "";
+			int urlOldResumptionSize = 0;
+			InputStream theHttpInputStream;

-			URL qUrl = new URL(query);
-			log.debug("authMethod: {}", authMethod);
-			if ("bearer".equalsIgnoreCase(this.authMethod)) {
-				log.trace("authMethod before inputStream: {}", resultXml);
-				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
-				conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
-				conn.setRequestMethod("GET");
-				theHttpInputStream = conn.getInputStream();
-			} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
-				log.trace("authMethod before inputStream: {}", resultXml);
-				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
-				conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
-				conn.setRequestMethod("GET");
-				theHttpInputStream = conn.getInputStream();
-			} else {
-				theHttpInputStream = qUrl.openStream();
-			}
-
-			resultStream = theHttpInputStream;
-			if ("json".equals(resultOutputFormat)) {
-				resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
-				resultXml = JsonUtils.convertToXML(resultJson);
-				resultStream = IOUtils.toInputStream(resultXml, UTF_8);
-			}
-
-			if (!(emptyXml).equalsIgnoreCase(resultXml)) {
-				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
-				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
-				log.debug("nodeList.length: {}", nodeList.getLength());
-				for (int i = 0; i < nodeList.getLength(); i++) {
-					StringWriter sw = new StringWriter();
-					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
-					String toEnqueue = sw.toString();
-					if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
-						log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
-					} else {
-						recordQueue.add(sw.toString());
-					}
+			// check if cursor=* is initial set otherwise add it to the queryParam URL
+			if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
+				log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
+				if (!query.contains("&cursor=")) {
+					query += "&cursor=*";
 				}
-			} else {
-				log.warn("resultXml is equal with emptyXml");
 			}

-			resumptionInt += resultSizeValue;
+			// find pagination page start number in queryParam and remove before start the first query
+			if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
+				&& (query.contains("paginationStart="))) {

-			switch (resumptionType.toLowerCase()) {
-				case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
-					resumptionStr = xprResumptionPath.evaluate(resultNode);
-					break;
+				final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
+				m.find(); // guaranteed to be true for this regex

-				case "count": // begin at one step for all records, iterate over items
-					resumptionStr = Integer.toString(resumptionInt);
-					break;
+				String[] pageVal = m.group(0).split("=");
+				pagination = Integer.parseInt(pageVal[1]);

-				case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
-					if (resultSizeValue < 2) {
-						throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
+				// remove page start number from query and queryParams
+				queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", "");
+				query = query.replaceFirst("&?paginationStart=[0-9]+", "");
+
+			}
+
+			try {
+				log.info("requesting URL [{}]", query);
+
+				final URL qUrl = new URL(query);
+				log.debug("authMethod: {}", this.authMethod);
+				// compare by value and ignoring case: '==' on strings only matches the very same instance
+				if ("bearer".equalsIgnoreCase(this.authMethod)) {
+					log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+					requestHeaders.put("Authorization", "Bearer " + authToken);
+					// requestHeaders.put("Content-Type", "application/json");
+				} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+					log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+					requestHeaders.put("Authorization", "Basic " + authToken);
+					// requestHeaders.put("accept", "application/xml");
+				}
+				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+				conn.setRequestMethod("GET");
+				this.setRequestHeader(conn);
+				resultStream = conn.getInputStream();
+
+				if ("json".equals(this.resultOutputFormat)) {
+					resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
+					resultXml = JsonUtils.convertToXML(resultJson);
+					this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
+				}
+
+				if (!isEmptyXml(resultXml)) {
+					resultNode = (Node) this.xpath
+						.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
+					nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
+					log.debug("nodeList.length: {}", nodeList.getLength());
+					for (int i = 0; i < nodeList.getLength(); i++) {
+						final StringWriter sw = new StringWriter();
+						this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
+						final String toEnqueue = sw.toString();
+						if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
+							log
+								.warn(
+									"The following record resulted in empty item for the feeding queue: {}", resultXml);
+						} else {
+							this.recordQueue.add(sw.toString());
+						}
 					}
-					qUrlArgument = qUrl.getQuery();
-					String[] arrayQUrlArgument = qUrlArgument.split("&");
-					for (String arrayUrlArgStr : arrayQUrlArgument) {
-						if (arrayUrlArgStr.startsWith(resumptionParam)) {
-							String[] resumptionKeyValue = arrayUrlArgStr.split("=");
-							if (isInteger(resumptionKeyValue[1])) {
-								urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
-								log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
-							} else {
-								log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
+				} else {
+					log.warn("resultXml is equal with emptyXml");
+				}
+
+				this.resumptionInt += this.resultSizeValue;
+
+				switch (this.resumptionType.toLowerCase()) {
+					case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
+						this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
+						break;
+
+					case "count": // begin at one step for all records, iterate over items
+						this.resumptionStr = Integer.toString(this.resumptionInt);
+						break;
+
+					case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
+						if (this.resultSizeValue < 2) {
+							throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
+						}
+						qUrlArgument = qUrl.getQuery();
+
+						final String[] arrayQUrlArgument = qUrlArgument.split("&");
+						for (final String arrayUrlArgStr : arrayQUrlArgument) {
+							if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
+								final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
+								if (isInteger(resumptionKeyValue[1])) {
+									urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
+									log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
+								} else {
+									log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
+								}
 							}
 						}
-					}

-					if (((emptyXml).equalsIgnoreCase(resultXml))
-						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
-						// resumptionStr = "";
-						if (nodeList != null) {
-							discoverResultSize += nodeList.getLength();
+						if (isEmptyXml(resultXml)
+							|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
+							// resumptionStr = "";
+							if (nodeList != null) {
+								this.discoverResultSize += nodeList.getLength();
+							}
+							this.resultTotal = this.discoverResultSize;
+						} else {
+							this.resumptionStr = Integer.toString(this.resumptionInt);
+							this.resultTotal = this.resumptionInt + 1;
+							if (nodeList != null) {
+								this.discoverResultSize += nodeList.getLength();
+							}
 						}
-						resultTotal = discoverResultSize;
-					} else {
-						resumptionStr = Integer.toString(resumptionInt);
-						resultTotal = resumptionInt + 1;
-						if (nodeList != null) {
-							discoverResultSize += nodeList.getLength();
+						log.info("discoverResultSize: {}", this.discoverResultSize);
+						break;
+
+					case "pagination":
+					case "page": // pagination, iterate over page numbers
+						if (nodeList != null && nodeList.getLength() > 0) {
+							this.discoverResultSize += nodeList.getLength();
+						} else {
+							this.resultTotal = this.discoverResultSize;
+							this.pagination = this.discoverResultSize;
 						}
-					}
-					log.info("discoverResultSize: {}", discoverResultSize);
-					break;
+						this.pagination += 1;
+						this.resumptionInt = this.pagination;
+						this.resumptionStr = Integer.toString(this.resumptionInt);
+						break;

-				case "pagination":
-				case "page": // pagination, iterate over page numbers
-					pagination += 1;
-					if (nodeList != null) {
-						discoverResultSize += nodeList.getLength();
-					} else {
-						resultTotal = discoverResultSize;
-						pagination = discoverResultSize;
-					}
-					resumptionInt = pagination;
-					resumptionStr = Integer.toString(resumptionInt);
-					break;
+					case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
+										// in
+										// solr)
+						// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
+						// deep-cursor, Param 'resultSizeValue' is less than 2");}

-				case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
-									// solr)
-					// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
-					// deep-cursor, Param 'resultSizeValue' is less than 2");}
+						this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
+						this.queryParams = this.queryParams.replace("&cursor=*", "");

-					resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
-					queryParams = queryParams.replace("&cursor=*", "");
+						// terminating if length of nodeList is 0
+						if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
+							this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
+						} else {
+							this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
+																									// resultSizeValue
+							// because the iteration is over
+							// real length and the
+							// resultSizeValue is added before
+							// the switch()
+						}

-					// terminating if length of nodeList is 0
-					if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
-						resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
-					} else {
-						resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
-																					// because the iteration is over
-																					// real length and the
-																					// resultSizeValue is added before
-																					// the switch()
-					}
+						this.discoverResultSize = nodeList.getLength();

-					discoverResultSize = nodeList.getLength();
+						log
+							.debug(
+								"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
+									+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);

-					log
-						.debug(
-							"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
-								+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
+						break;

-					break;
+					default: // otherwise: abort
+						// resultTotal = resumptionInt;
+						break;
+				}

-				default: // otherwise: abort
-					// resultTotal = resumptionInt;
-					break;
+			} catch (final Exception e) {
+				log.error(e.getMessage(), e);
+				throw new IllegalStateException("collection failed: " + e.getMessage());
 			}

-		} catch (Exception e) {
-			log.error(e.getMessage(), e);
-			throw new IllegalStateException("collection failed: " + e.getMessage());
-		}
-
-		try {
-			if (resultTotal == -1) {
-				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
-				if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
-					resultTotal += 1;
-				} // to correct the upper bound
-				log.info("resultTotal was -1 is now: " + resultTotal);
+			try {
+				if (this.resultTotal == -1) {
+					this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
+					if ("page".equalsIgnoreCase(this.resumptionType)
+						&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+						this.resultTotal += 1;
+					} // to correct the upper bound
+					log.info("resultTotal was -1 is now: " + this.resultTotal);
+				}
+			} catch (final Exception e) {
+				log.error(e.getMessage(), e);
+				throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
 			}
-		} catch (Exception e) {
-			log.error(e.getMessage(), e);
-			throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
+			log.debug("resultTotal: " + this.resultTotal);
+			log.debug("resInt: " + this.resumptionInt);
+			if (this.resumptionInt <= this.resultTotal) {
+				nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
+					+ this.resumptionStr
+					+ this.queryFormat;
+			} else {
+				nextQuery = "";
+				// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
+				// resumptionInt and prevent a NullPointer Exception at mdStore
+			}
+			log.debug("nextQueryUrl: " + nextQuery);
+			return nextQuery;
+		} catch (final Throwable e) {
+			log.warn(e.getMessage(), e);
+			return downloadPage(query, attempt + 1);
 		}
-		log.debug("resultTotal: " + resultTotal);
-		log.debug("resInt: " + resumptionInt);
-		if (resumptionInt <= resultTotal) {
-			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
-				+ queryFormat;
-		} else {
-			nextQuery = "";
-			// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
-			// resumptionInt and prevent a NullPointer Exception at mdStore
-		}
-		log.debug("nextQueryUrl: " + nextQuery);
-		return nextQuery;

 	}

-	private boolean isInteger(String s) {
+	/**
+	 * Checks whether the given string equals the EMPTY_XML constant, ignoring case.
+	 *
+	 * @param s the string to check
+	 * @return true when s matches EMPTY_XML ignoring case
+	 */
+	private boolean isEmptyXml(String s) {
+		return EMPTY_XML.equalsIgnoreCase(s);
+	}
+
+	private boolean isInteger(final String s) {
 		boolean isValidInteger = false;
 		try {
 			Integer.parseInt(s);
@ -383,7 +450,7 @@ public class RestIterator implements Iterator<String> {
 			// s is a valid integer

 			isValidInteger = true;
-		} catch (NumberFormatException ex) {
+		} catch (final NumberFormatException ex) {
 			// s is not an integer
 		}

@ -391,20 +458,36 @@ public class RestIterator implements Iterator<String> {
 	}

 	// Method to encode a string value using `UTF-8` encoding scheme
-	private String encodeValue(String value) {
+	private String encodeValue(final String value) {
 		try {
 			return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
-		} catch (UnsupportedEncodingException ex) {
+		} catch (final UnsupportedEncodingException ex) {
 			throw new RuntimeException(ex.getCause());
 		}
 	}

+	/**
+	 * Copies every configured request header onto the given connection.
+	 *
+	 * Each header is applied with setRequestProperty, which overwrites the value of a property whose key
+	 * already exists. Nothing is done when no request headers are configured.
+	 *
+	 * @param conn the connection the headers are applied to
+	 */
+	private void setRequestHeader(HttpURLConnection conn) {
+		if (requestHeaders == null) {
+			return;
+		}
+		requestHeaders.forEach((name, value) -> conn.setRequestProperty(name, value));
+		log.debug("Set Request Header with: " + requestHeaders);
+	}
+
+
 	public String getResultFormatValue() {
-		return resultFormatValue;
+		return this.resultFormatValue;
 	}

 	public String getResultOutputFormat() {
-		return resultOutputFormat;
+		return this.resultOutputFormat;
 	}

 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
@ -8,7 +8,10 @@ import java.io.StringWriter;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
 import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;

 import javax.xml.stream.XMLEventFactory;
 import javax.xml.stream.XMLEventReader;
@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.events.StartElement;
 import javax.xml.stream.events.XMLEvent;

+import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {

 	private String element;

+	private List<String> elements;
+
 	private InputStream inputStream;

 	public XMLIterator(final String element, final InputStream inputStream) {
 		super();
 		this.element = element;
+		if (element.contains(",")) {
+			elements = Arrays
+				.stream(element.split(","))
+				.filter(StringUtils::isNoneBlank)
+				.map(String::toLowerCase)
+				.collect(Collectors.toList());
+		}
 		this.inputStream = inputStream;
 		this.parser = getParser();
+
 		try {
 			this.current = findElement(parser);
 		} catch (XMLStreamException e) {
@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
 				final XMLEvent event = parser.nextEvent();

 				// TODO: replace with depth tracking instead of close tag tracking.
-				if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
+				if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
 					writer.add(event);
 					break;
 				}
@ -142,18 +156,16 @@ public class XMLIterator implements Iterator<String> {
 		XMLEvent peek = parser.peek();
 		if (peek != null && peek.isStartElement()) {
 			String name = peek.asStartElement().getName().getLocalPart();
-			if (element.equals(name)) {
+			if (isCheckTag(name))
 				return peek;
-			}
 		}

 		while (parser.hasNext()) {
-			final XMLEvent event = parser.nextEvent();
+			XMLEvent event = parser.nextEvent();
 			if (event != null && event.isStartElement()) {
 				String name = event.asStartElement().getName().getLocalPart();
-				if (element.equals(name)) {
+				if (isCheckTag(name))
 					return event;
-				}
 			}
 		}
 		return null;
@ -161,12 +173,31 @@ public class XMLIterator implements Iterator<String> {

 	private XMLEventReader getParser() {
 		try {
-			return inputFactory.get().createXMLEventReader(sanitize(inputStream));
+			XMLInputFactory xif = inputFactory.get();
+			// harden the parser against XXE: neither DTDs nor external entities are processed
+			xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+			xif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+			return xif.createXMLEventReader(sanitize(inputStream));
 		} catch (XMLStreamException e) {
 			throw new RuntimeException(e);
 		}
 	}

+	/**
+	 * Tells whether the given tag name is one of the element names this iterator looks for.
+	 * The comparison ignores case; the list of names is used when it has been set, the single
+	 * element name otherwise.
+	 *
+	 * @param tagName local name of the tag to check
+	 * @return true when the tag name matches
+	 */
+	private boolean isCheckTag(final String tagName) {
+		if (elements == null) {
+			return element.equalsIgnoreCase(tagName);
+		}
+		return elements.stream().anyMatch(candidate -> candidate.equalsIgnoreCase(tagName));
+	}
+
 	private Reader sanitize(final InputStream in) {
 		final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
 		charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json
@ -28,7 +28,13 @@
    "paramLongName": "dataciteInputPath",
    "paramDescription": "the path to get the input data from Datacite",
    "paramRequired": true
-  },
+  },{
+  "paramName": "wip",
+  "paramLongName": "webCrawlInputPath",
+  "paramDescription": "the path to get the input data from Web Crawl",
+  "paramRequired": true
+}
+,
  {
    "paramName": "o",
    "paramLongName": "outputPath",
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
@ -17,6 +17,10 @@
            <name>dataciteInputPath</name>
            <description>the path where to find the inferred affiliation relations from Datacite</description>
        </property>
+        <property>
+            <name>webCrawlInputPath</name>
+            <description>the path where to find the inferred affiliation relations from webCrawl</description>
+        </property>
        <property>
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
@ -112,7 +116,7 @@
            <arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
            <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
            <arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
-
+            <arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
@ -16,5 +16,10 @@
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "the hdfs name node",
    "paramRequired": false
-  }
+  },{
+  "paramName": "bl",
+  "paramLongName": "blackListPath",
+  "paramDescription": "the path to the black list",
+  "paramRequired": true
+}
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
@ -1,2 +1,3 @@
 sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
 outputPath=/tmp/miriam/webcrawlComplete/
+blackListPath=/user/miriam.baglioni/openalex-blackList
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
@ -45,6 +45,7 @@
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--blackListPath</arg><arg>${blackListPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
@ -58,7 +58,7 @@
    "uri": "http://dx.doi.org/10.13039/100010414",
    "name": "Health Research Board",
    "synonym": [
-      "501100001590"
+      "501100001590", "501100023273"
    ]
  },
  {
@ -85,24 +85,6 @@
    "name": "Irish College of General Practitioners",
    "synonym": []
  },
-  {
-    "id": "100012734",
-    "uri": "http://dx.doi.org/10.13039/100012734",
-    "name": "Department for Culture, Heritage and the Gaeltacht, Ireland",
-    "synonym": []
-  },
-  {
-    "id": "100012754",
-    "uri": "http://dx.doi.org/10.13039/100012754",
-    "name": "Horizon Pharma",
-    "synonym": []
-  },
-  {
-    "id": "100012891",
-    "uri": "http://dx.doi.org/10.13039/100012891",
-    "name": "Medical Research Charities Group",
-    "synonym": []
-  },
  {
    "id": "100012919",
    "uri": "http://dx.doi.org/10.13039/100012919",
@ -233,7 +215,7 @@
    "id": "100018064",
    "uri": "http://dx.doi.org/10.13039/100018064",
    "name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media",
-    "synonym": []
+    "synonym": ["100012734"]
  },
  {
    "id": "100018172",
@ -281,13 +263,13 @@
    "id": "100019637",
    "uri": "http://dx.doi.org/10.13039/100019637",
    "name": "Horizon Therapeutics",
-    "synonym": []
+    "synonym": ["100012754"]
  },
  {
    "id": "100020174",
    "uri": "http://dx.doi.org/10.13039/100020174",
    "name": "Health Research Charities Ireland",
-    "synonym": []
+    "synonym": ["100012891"]
  },
  {
    "id": "100020202",
@ -319,12 +301,7 @@
    "name": "Centre for Ageing Research and Development in Ireland",
    "synonym": []
  },
-  {
-    "id": "501100001583",
-    "uri": "http://dx.doi.org/10.13039/501100001583",
-    "name": "Cystinosis Foundation Ireland",
-    "synonym": []
-  },
+
  {
    "id": "501100001584",
    "uri": "http://dx.doi.org/10.13039/501100001584",
@ -521,7 +498,7 @@
    "id": "501100003037",
    "uri": "http://dx.doi.org/10.13039/501100003037",
    "name": "Elan",
-    "synonym": []
+    "synonym": ["501100021694"]
  },
  {
    "id": "501100003496",
@ -595,17 +572,11 @@
    "name": "Technological University Dublin",
    "synonym": []
  },
-  {
-    "id": "501100009269",
-    "uri": "http://dx.doi.org/10.13039/501100009269",
-    "name": "Programme of Competitive Forestry Research for Development",
-    "synonym": []
-  },
  {
    "id": "501100009315",
    "uri": "http://dx.doi.org/10.13039/501100009315",
    "name": "Cystinosis Ireland",
-    "synonym": []
+    "synonym": ["501100001583"]
  },
  {
    "id": "501100010808",
@ -625,12 +596,6 @@
    "name": "Alimentary Health",
    "synonym": []
  },
-  {
-    "id": "501100011103",
-    "uri": "http://dx.doi.org/10.13039/501100011103",
-    "name": "Rann\u00eds",
-    "synonym": []
-  },
  {
    "id": "501100012354",
    "uri": "http://dx.doi.org/10.13039/501100012354",
@ -733,12 +698,6 @@
    "name": "Insight SFI Research Centre for Data Analytics",
    "synonym": []
  },
-  {
-    "id": "501100021694",
-    "uri": "http://dx.doi.org/10.13039/501100021694",
-    "name": "Elan Pharma International",
-    "synonym": []
-  },
  {
    "id": "501100021838",
    "uri": "http://dx.doi.org/10.13039/501100021838",
@ -769,12 +728,6 @@
    "name": "Institute of Technology, Tralee",
    "synonym": []
  },
-  {
-    "id": "501100023273",
-    "uri": "http://dx.doi.org/10.13039/501100023273",
-    "name": "HRB Clinical Research Facility Galway",
-    "synonym": []
-  },
  {
    "id": "501100023378",
    "uri": "http://dx.doi.org/10.13039/501100023378",
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
            tp._1 match {
              case "electronic" => journal.setIssnOnline(tp._2)
              case "print"      => journal.setIssnPrinted(tp._2)
+              case _            =>
            }
          })
        }
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
@ -79,23 +79,6 @@ object MagUtility extends Serializable {
  private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)

  private val MAGDataInfo: DataInfo = {
-    val di = new DataInfo
-    di.setDeletedbyinference(false)
-    di.setInferred(false)
-    di.setInvisible(false)
-    di.setTrust("0.9")
-    di.setProvenanceaction(
-      OafMapperUtils.qualifier(
-        ModelConstants.SYSIMPORT_ACTIONSET,
-        ModelConstants.SYSIMPORT_ACTIONSET,
-        ModelConstants.DNET_PROVENANCE_ACTIONS,
-        ModelConstants.DNET_PROVENANCE_ACTIONS
-      )
-    )
-    di
-  }
-
-  private val MAGDataInfoInvisible: DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
@ -453,7 +436,6 @@ object MagUtility extends Serializable {

      case "repository" =>
        result = new Publication()
-        result.setDataInfo(MAGDataInfoInvisible)
        qualifier(
          "0038",
          "Other literature type",
@ -488,8 +470,7 @@ object MagUtility extends Serializable {
    }

    if (result != null) {
-      if (result.getDataInfo == null)
-        result.setDataInfo(MAGDataInfo)
+      result.setDataInfo(MAGDataInfo)
      val i = new Instance
      i.setInstancetype(tp)
      i.setInstanceTypeMapping(
@ -512,7 +493,7 @@ object MagUtility extends Serializable {
      return null

    result.setCollectedfrom(List(MAGCollectedFrom).asJava)
-    val pidList = List(
+    var pidList = List(
      structuredProperty(
        paper.paperId.get.toString,
        qualifier(
@ -525,8 +506,6 @@ object MagUtility extends Serializable {
      )
    )

-    result.setPid(pidList.asJava)
-
    result.setOriginalId(pidList.map(s => s.getValue).asJava)

    result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@ -618,22 +597,23 @@ object MagUtility extends Serializable {
    }

    val instance = result.getInstance().get(0)
-    instance.setPid(pidList.asJava)
-    if (paper.doi.orNull != null)
-      instance.setAlternateIdentifier(
-        List(
-          structuredProperty(
-            paper.doi.get,
-            qualifier(
-              PidType.doi.toString,
-              PidType.doi.toString,
-              ModelConstants.DNET_PID_TYPES,
-              ModelConstants.DNET_PID_TYPES
-            ),
-            null
-          )
-        ).asJava
+
+    if (paper.doi.orNull != null) {
+      pidList = pidList ::: List(
+        structuredProperty(
+          paper.doi.get,
+          qualifier(
+            PidType.doi.toString,
+            PidType.doi.toString,
+            ModelConstants.DNET_PID_TYPES,
+            ModelConstants.DNET_PID_TYPES
+          ),
+          null
+        )
      )
+    }
+    instance.setPid(pidList.asJava)
+    result.setPid(pidList.asJava)
    instance.setUrl(paper.urls.get.asJava)
    instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
    instance.setCollectedfrom(MAGCollectedFrom)
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
    spark.read
      .load(s"$magBasePath/mag_denormalized")
      .as[MAGPaper]
+      .filter(col("doi").isNotNull)
      .map(s => MagUtility.convertMAGtoOAF(s))
      .filter(s => s != null)
      .write
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
-import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.pubmed._
-import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
 import org.apache.http.impl.client.HttpClientBuilder
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql._
+import org.apache.spark.sql.expressions.Aggregator
 import org.slf4j.{Logger, LoggerFactory}

-import java.io.InputStream
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
+import java.io.{ByteArrayInputStream, InputStream}
+import java.nio.charset.Charset
+import javax.xml.stream.XMLInputFactory

 object SparkCreateBaselineDataFrame {

@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
          if (response.getStatusLine.getStatusCode > 400) {
            tries -= 1
          } else
-            return IOUtils.toString(response.getEntity.getContent)
+            return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
        } catch {
          case e: Throwable =>
            println(s"Error on requesting ${r.getURI}")
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
      IOUtils.toString(
        SparkEBILinksToOaf.getClass.getResourceAsStream(
          "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
-        )
+        ),
+        Charset.defaultCharset()
      )
    )
    parser.parseArgument(args)
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
    val workingPath = parser.get("workingPath")
    log.info("workingPath: {}", workingPath)

-    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
-    log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
-
-    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
-    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
-    log.info("outputBasePath: {}", outputBasePath)
+    val targetPath = parser.get("targetPath")
+    log.info("targetPath: {}", targetPath)

    val hdfsServerUri = parser.get("hdfsServerUri")
-    log.info("hdfsServerUri: {}", hdfsServerUri)
+    log.info("hdfsServerUri: {}", hdfsServerUri)

    val skipUpdate = parser.get("skipUpdate")
    log.info("skipUpdate: {}", skipUpdate)
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
    if (!"true".equalsIgnoreCase(skipUpdate)) {
      downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
      val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
+      // XMLInputFactory is not Serializable: build it inside the closure so Spark does not have to ship it
      val ds: Dataset[PMArticle] = spark.createDataset(
        k.filter(i => i._1.endsWith(".gz"))
          .flatMap(i => {
-            val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+            val xml = XMLInputFactory.newInstance.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
            new PMParser(xml)
          })
      )
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
        .map(a => PubMedToOaf.convert(a, vocabularies))
        .as[Oaf]
        .filter(p => p != null),
-      s"$outputBasePath/$MDSTORE_DATA_PATH"
+      targetPath
    )

-    val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
-    val mdStoreSize = df.count
-    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
@ -1,7 +1,8 @@
 package eu.dnetlib.dhp.sx.bio.pubmed

 import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}

 /** @param xml
  */
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest {
 					"-pubmedInputPath", crossrefAffiliationRelationPath,
 					"-openapcInputPath", crossrefAffiliationRelationPath,
 					"-dataciteInputPath", crossrefAffiliationRelationPath,
+					"-webCrawlInputPath", crossrefAffiliationRelationPath,
 					"-outputPath", outputPath
 				});

@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest {
 //            );
 //        }
 		// count the number of relations
-		assertEquals(80, tmp.count());
+		assertEquals(120, tmp.count());

 		Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
 		dataset.createOrReplaceTempView("result");
@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
 		// verify that we have equal number of bi-directional relations
 		Assertions
 			.assertEquals(
-				40, execVerification
+				60, execVerification
 					.filter(
 						"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
 					.collectAsList()
@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest {

 		Assertions
 			.assertEquals(
-				40, execVerification
+				60, execVerification
 					.filter(
 						"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
 					.collectAsList()
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
@ -119,7 +119,9 @@ public class ReadCOCITest {
 					workingDir.toString() + "/COCI",
 					"-outputPath",
 					workingDir.toString() + "/COCI_json/",
-					"-inputFile", "input1;input2;input3;input4;input5"
+					"-inputFile", "input1;input2;input3;input4;input5",
+					"-format",
+					"COCI"
 				});

 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
@ -75,7 +75,11 @@ public class CreateASTest {

 		String inputPath = getClass()
 			.getResource(
-				"/eu/dnetlib/dhp/actionmanager/webcrawl/")
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
+			.getPath();
+		String blackListPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
 			.getPath();

 		CreateActionSetFromWebEntries
@ -86,7 +90,8 @@ public class CreateASTest {
 					"-sourcePath",
 					inputPath,
 					"-outputPath",
-					workingDir.toString() + "/actionSet1"
+					workingDir.toString() + "/actionSet1",
+					"-blackListPath", blackListPath
 				});

 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -96,7 +101,7 @@ public class CreateASTest {
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));

-		Assertions.assertEquals(64, tmp.count());
+		Assertions.assertEquals(58, tmp.count());

 	}

@ -109,6 +114,10 @@ public class CreateASTest {
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/webcrawl/")
 			.getPath();
+		String blackListPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
+			.getPath();

 		CreateActionSetFromWebEntries
 			.main(
@ -118,7 +127,8 @@ public class CreateASTest {
 					"-sourcePath",
 					inputPath,
 					"-outputPath",
-					workingDir.toString() + "/actionSet1"
+					workingDir.toString() + "/actionSet1",
+					"-blackListPath", blackListPath
 				});

 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -184,7 +194,7 @@ public class CreateASTest {

 		Assertions
 			.assertEquals(
-				5, tmp
+				2, tmp
 					.filter(
 						r -> r
 							.getSource()
@ -197,7 +207,7 @@ public class CreateASTest {

 		Assertions
 			.assertEquals(
-				5, tmp
+				2, tmp
 					.filter(
 						r -> r
 							.getTarget()
@ -210,7 +220,7 @@ public class CreateASTest {

 		Assertions
 			.assertEquals(
-				2, tmp
+				1, tmp
 					.filter(
 						r -> r
 							.getTarget()
@ -224,7 +234,7 @@ public class CreateASTest {

 		Assertions
 			.assertEquals(
-				2, tmp
+				1, tmp
 					.filter(
 						r -> r
 							.getTarget()
@ -238,7 +248,7 @@ public class CreateASTest {

 		Assertions
 			.assertEquals(
-				1, tmp
+				0, tmp
 					.filter(
 						r -> r
 							.getTarget()
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java
@ -0,0 +1,64 @@
+
+package eu.dnetlib.dhp.collection.plugin.file;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Objects;
+import java.util.stream.Stream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+
+/** Verifies that {@link FileGZipCollectorPlugin} emits non-empty records when given a splitOnElement parameter. */
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
+@ExtendWith(MockitoExtension.class)
+public class FileGZipMultipleNodeTest {
+
+	private static final Logger log = LoggerFactory.getLogger(FileGZipMultipleNodeTest.class);
+
+	private final ApiDescriptor api = new ApiDescriptor();
+
+	private FileGZipCollectorPlugin plugin;
+
+	// Value of the "splitOnElement" parameter; presumably a comma-separated list of element names (confirm in plugin).
+	private static final String SPLIT_ON_ELEMENT = "incollection,article";
+
+	/** Points the API descriptor at the bundled dblp.gz resource and creates the plugin under test. */
+	@BeforeEach
+	public void setUp() throws IOException {
+		final String gzipFile = Objects
+			.requireNonNull(
+				this
+					.getClass()
+					.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
+			.getFile();
+
+		api.setBaseUrl(gzipFile);
+		HashMap<String, String> params = new HashMap<>();
+		params.put("splitOnElement", SPLIT_ON_ELEMENT);
+		api.setParams(params);
+
+		FileSystem fs = FileSystem.get(new Configuration());
+		plugin = new FileGZipCollectorPlugin(fs);
+	}
+
+	/** Collects at most ten records and asserts that each one is non-empty. */
+	@Test
+	void test() throws CollectorException {
+		// NOTE(review): an empty stream passes vacuously; consider also asserting that something was collected.
+		final Stream<String> stream = plugin.collect(api, new AggregatorReport());
+		stream.limit(10).forEach(s -> {
+			Assertions.assertTrue(s.length() > 0);
+			log.info(s);
+		});
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.rest;

 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Stream;

 import org.junit.jupiter.api.Assertions;
@ -35,11 +36,11 @@ public class OsfPreprintCollectorTest {
 	private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";

 	private final String resumptionParam = "page";
-	private final String resumptionType = "page";
-	private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
+	private final String resumptionType = "scan";
+	private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";

-	private final String resultSizeParam = "";
-	private final String resultSizeValue = "";
+	private final String resultSizeParam = "page[size]";
+	private final String resultSizeValue = "100";

 	private final String resultFormatParam = "format";
 	private final String resultFormatValue = "json";
@ -69,11 +70,11 @@ public class OsfPreprintCollectorTest {

 	@Test
 	@Disabled
-	void test() throws CollectorException {
+	void test_limited() throws CollectorException {
 		final AtomicInteger i = new AtomicInteger(0);
 		final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());

-		stream.limit(200).forEach(s -> {
+		stream.limit(2000).forEach(s -> {
 			Assertions.assertTrue(s.length() > 0);
 			i.incrementAndGet();
 			log.info(s);
@ -82,4 +83,23 @@ public class OsfPreprintCollectorTest {
 		log.info("{}", i.intValue());
 		Assertions.assertTrue(i.intValue() > 0);
 	}
+
+	/** Drains the entire collector stream, logging a progress line every 1000 records. */
+	@Test
+	@Disabled
+	void test_all() throws CollectorException {
+		final AtomicLong collected = new AtomicLong(0);
+
+		this.rcp.collect(this.api, new AggregatorReport()).forEach(item -> {
+			Assertions.assertTrue(item.length() > 0);
+			final long soFar = collected.incrementAndGet();
+			if (soFar % 1000 == 0) {
+				log.info("COLLECTED: {}", soFar);
+			}
+		});
+
+		log.info("TOTAL: {}", collected.get());
+		Assertions.assertTrue(collected.get() > 0);
+	}
+
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
@ -4,6 +4,11 @@

 package eu.dnetlib.dhp.collection.plugin.rest;

+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Stream;
@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
 import eu.dnetlib.dhp.common.collection.CollectorException;
@ -25,18 +32,18 @@ class RestCollectorPluginTest {

 	private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);

-	private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
-	private final String resumptionType = "count";
-	private final String resumptionParam = "from";
-	private final String entityXpath = "//hits/hits";
-	private final String resumptionXpath = "//hits";
-	private final String resultTotalXpath = "//hits/total";
-	private final String resultFormatParam = "format";
+	private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
+	private final String resumptionType = "discover";
+	private final String resumptionParam = "skip";
+	private final String entityXpath = "//*[local-name()='data']";
+	private final String resumptionXpath = "";
+	private final String resultTotalXpath = "//*[local-name()='count']";
+	private final String resultFormatParam = "";
 	private final String resultFormatValue = "json";
-	private final String resultSizeParam = "size";
+	private final String resultSizeParam = "top";
 	private final String resultSizeValue = "10";
 	// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
-	private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
+	private final String query = "";
 	// private String query = "=(sources:engrXiv AND type:preprint)";

 	private final String protocolDescriptor = "rest_json2xml";
@ -56,6 +63,7 @@ class RestCollectorPluginTest {
 		params.put("resultSizeValue", resultSizeValue);
 		params.put("queryParams", query);
 		params.put("entityXpath", entityXpath);
+		params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");

 		api.setBaseUrl(baseUrl);
 		api.setParams(params);
@ -78,4 +86,19 @@ class RestCollectorPluginTest {
 		log.info("{}", i.intValue());
 		Assertions.assertTrue(i.intValue() > 0);
 	}
+
+	@Disabled
+	@Test
+	void testUrl() throws IOException {
+		// Manual connectivity probe against the World Bank search endpoint (kept disabled: needs network).
+		final URL url = new URL("https://ddh-openapi.worldbank.org/search?&top=10");
+		final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+		conn.setRequestMethod("GET");
+		conn.setRequestProperty("User-Agent", "OpenAIRE");
+		// getHeaderFields() returns the headers of the RESPONSE, not of the request.
+		log.info("Response header: {}", new Gson().toJson(conn.getHeaderFields()));
+		try (InputStream inputStream = conn.getInputStream()) {
+			Assertions.assertNotNull(inputStream);
+		}
+	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
@ -44,7 +44,7 @@ public class RestIteratorTest {

 		final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
 			resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
-			query, entityXpath, authMethod, authToken, resultOffsetParam);
+			query, entityXpath, authMethod, authToken, resultOffsetParam, null);
 		int i = 20;
 		while (iterator.hasNext() && i > 0) {
 			String result = iterator.next();
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json
@ -4,4 +4,6 @@
 {"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
 {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
 {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
-{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
+{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
+{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
+{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
@ -789,10 +789,6 @@
      "value": "2227-9717",
      "type": "electronic"
    },
-    {
-      "value": "VALUE",
-      "type": "PIPPO"
-    },
    {
      "value": "1063-4584",
      "type": "pu"
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref

 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import org.apache.commons.io.IOUtils
+import org.junit.jupiter.api.{BeforeEach, Test}
 import org.junit.jupiter.api.extension.ExtendWith
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.{Logger, LoggerFactory}
@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
    super.setUpVocabulary()
  }

+  /** Smoke test: converts the issn_pub.json sample through Crossref2Oaf and prints the result. */
+  @Test
+  def mappingRecord(): Unit = {
+    val resourcePath = "/eu/dnetlib/dhp/collection/crossref/issn_pub.json"
+    val json = IOUtils.toString(getClass.getResourceAsStream(resourcePath), "utf-8")
+    val converted = Crossref2Oaf.convert(json, vocabularies, TransformationType.All)
+    println(converted)
+  }
+
 }
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.col
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test

@ -18,10 +19,8 @@ class MAGMappingTest {
      .master("local[*]")
      .getOrCreate()

-    val s = new SparkMagOrganizationAS(null, null, null)
-
-    s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
-
+    val s = new SparkMAGtoOAF(null, null, null)
+    s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
  }

  @Test
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension

 import java.io.{BufferedReader, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
+import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {

  @Test
  def testEBIData() = {
-    val inputXML = Source
-      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-      .mkString
-    val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
+    val inputFactory = XMLInputFactory.newInstance
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
  }

@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {

  @Test
  def testParsingPubmedXML(): Unit = {
-    val xml = new XMLEventReader(
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-    )
+    val inputFactory = XMLInputFactory.newInstance
+
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
    val parser = new PMParser(xml)
    parser.foreach(checkPMArticle)
  }
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
  @Test
  def testPubmedMapping(): Unit = {

-    val xml = new XMLEventReader(
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-    )
+    val inputFactory = XMLInputFactory.newInstance
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
    val parser = new PMParser(xml)
    val results = ListBuffer[Oaf]()
    parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
--- a/dhp-workflows/dhp-dedup-openaire/pom.xml
+++ b/dhp-workflows/dhp-dedup-openaire/pom.xml
@ -53,24 +53,10 @@
            <artifactId>dhp-pace-core</artifactId>
            <version>${project.version}</version>
        </dependency>
-
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
-
-        <dependency>
-            <groupId>org.scala-lang.modules</groupId>
-            <artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
-            <version>1.0.2</version>
-        </dependency>
-
-        <dependency>
-            <groupId>org.scala-lang.modules</groupId>
-            <artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
-            <version>2.11.0</version>
-        </dependency>
-
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
@ -79,16 +65,10 @@
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
        </dependency>
-
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_${scala.binary.version}</artifactId>
        </dependency>
-
-        <dependency>
-            <groupId>com.arakelian</groupId>
-            <artifactId>java-jq</artifactId>
-        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
@ -101,10 +81,6 @@
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
-        <dependency>
-            <groupId>com.fasterxml.jackson.core</groupId>
-            <artifactId>jackson-core</artifactId>
-        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple3;
 import scala.collection.JavaConversions;

@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			Dataset<Row> pivotHistory = spark
 				.createDataset(
 					Collections.emptyList(),
-					RowEncoder
-						.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
+					SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));

 			if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
 				pivotHistory = spark
@ -203,8 +203,8 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			WindowSpec w = Window
 				.partitionBy("groupId")
 				.orderBy(
-					col("lastUsage").desc_nulls_last(),
 					col("pidType").asc_nulls_last(),
+					col("lastUsage").desc_nulls_last(),
 					col("collectedfrom").desc_nulls_last(),
 					col("date").asc_nulls_last(),
 					col("id").asc_nulls_last());
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
 			.map(
 				(MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
 					"",
-					r._1()._2().getOriginalId().get(0),
-					r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
-					r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
-					r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
-					r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
-					r._1()._2().getCollectedfrom().get(0).getValue(),
+					Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
+					Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
+					Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
+					Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
+					Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
+					Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
 					"",
 					structuredPropertyListToString(r._1()._2().getPid()),
 					parseECField(r._1()._2().getEclegalbody()),
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
 					final Organization o = r._2()._2();
 					return new OrgSimRel(
 						r._1()._1(),
-						o.getOriginalId().get(0),
+						Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
 						Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
 						Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
 						Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
 			.map(
 				(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
 					OrgSimRel orgSimRel = r._1()._2();
-					orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
+					orgSimRel
+						.setLocal_id(
+							Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
 					return orgSimRel;
 				},
 				Encoders.bean(OrgSimRel.class));
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.catalyst.encoders.RowEncoder;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple2;
 import scala.Tuple3;

@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
 		StructType idsSchema = StructType
 			.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");

-		Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+		Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));

 		for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
 			String entityPath = graphBasePath + '/' + entityType.name();
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
@ -0,0 +1,128 @@
+
+package eu.dnetlib.dhp.oa.dedup;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.codehaus.jackson.map.ObjectMapper;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import scala.Tuple2;
+
+/**
+ * Checks that {@link MergeUtils#mergeGroup}, applied to the records of the {@code dataset_merge.json} sample,
+ * yields a record carrying the dedup id and two instances.
+ */
+class DatasetMergerTest implements Serializable {
+
+	private List<Tuple2<String, Dataset>> datasets;
+
+	private String testEntityBasePath;
+	private DataInfo dataInfo;
+	private final String dedupId = "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148";
+	private Dataset dataset_top;
+
+	/**
+	 * Loads the sample records from the test resources and prepares the fixtures used by the test.
+	 */
+	@BeforeEach
+	public void setUp() throws Exception {
+		testEntityBasePath = Paths
+			.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
+			.toFile()
+			.getAbsolutePath();
+
+		datasets = readSample(testEntityBasePath + "/dataset_merge.json", Dataset.class);
+
+		dataset_top = getTopPub(datasets);
+
+		dataInfo = setDI();
+	}
+
+	@Test
+	void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
+		Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
+
+		// verify id
+		assertEquals(dedupId, pub_merged.getId());
+		assertEquals(2, pub_merged.getInstance().size());
+	}
+
+	/**
+	 * Builds the DataInfo fixture: trust 0.9, inferred, not deleted by inference, provenance "testing".
+	 */
+	public DataInfo setDI() {
+		DataInfo dataInfo = new DataInfo();
+		dataInfo.setTrust("0.9");
+		dataInfo.setDeletedbyinference(false);
+		dataInfo.setInferenceprovenance("testing");
+		dataInfo.setInferred(true);
+		return dataInfo;
+	}
+
+	/**
+	 * Returns the record with the highest trust value, or a new empty Dataset when no record has a trust above 0.
+	 *
+	 * @param publications the (id, record) pairs to scan
+	 * @return the first record whose DataInfo trust is the highest
+	 */
+	public Dataset getTopPub(List<Tuple2<String, Dataset>> publications) {
+
+		double maxTrust = 0.0;
+		Dataset maxPub = new Dataset();
+		for (Tuple2<String, Dataset> publication : publications) {
+			double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
+			if (pubTrust > maxTrust) {
+				maxTrust = pubTrust;
+				maxPub = publication._2();
+			}
+		}
+		return maxPub;
+	}
+
+	/**
+	 * Reads a file with one JSON record per line and pairs each parsed record with its {@code $.id} value.
+	 *
+	 * @param path the file to read
+	 * @param clazz the type each line is deserialized to
+	 * @return the (id, record) pairs in file order
+	 * @throws IllegalStateException when the file cannot be read or parsed, so that the test fails instead of
+	 *         running on an empty or partial sample
+	 */
+	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+		List<Tuple2<String, T>> res = new ArrayList<>();
+		// one mapper is enough for the whole file
+		final ObjectMapper mapper = new ObjectMapper();
+		// try-with-resources closes the reader also when a line fails to parse
+		try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
+			String line = reader.readLine();
+			while (line != null) {
+				res
+					.add(
+						new Tuple2<>(
+							MapDocumentUtil.getJPathString("$.id", line),
+							mapper.readValue(line, clazz)));
+				// read next line
+				line = reader.readLine();
+			}
+		} catch (IOException e) {
+			throw new IllegalStateException("Unable to read sample file " + path, e);
+		}
+
+		return res;
+	}
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -93,14 +93,14 @@ class EntityMergerTest implements Serializable {
 		assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
 		assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
 		assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
-		assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
-		assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
-		assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
-		assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
+		assertEquals(pub_top.getResulttype().getClassid(), pub_merged.getResulttype().getClassid());
+		assertEquals(pub_top.getLanguage().getClassid(), pub_merged.getLanguage().getClassid());
+		assertEquals("Elsevier BV", pub_merged.getPublisher().getValue());
+		assertEquals(pub_top.getEmbargoenddate().getValue(), pub_merged.getEmbargoenddate().getValue());
 		assertEquals(pub_top.getResourcetype().getClassid(), "");
 		assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
 		assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
-		assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
+		// assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
 		assertEquals(3, pub_merged.getInstance().size());
 		assertEquals(2, pub_merged.getCountry().size());
 		assertEquals(0, pub_merged.getSubject().size());
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
@ -172,7 +172,7 @@ public class SparkBulkTagJob {
 			.option("compression", "gzip")
 			.json(outputPath + "project");

-		readPath(spark, outputPath + "project", Datasource.class)
+		readPath(spark, outputPath + "project", Project.class)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
@ -50,7 +50,7 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 	 * @param subject
 	 */
 	private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
-												  Subject subject) {
+		Subject subject) {

 		vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
 			if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
@ -61,13 +61,14 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
 					subject.getQualifier().setClassname(vocabulary.getName());
 				}
 			} else {
-				final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo())
-						.map(DataInfo::getProvenanceaction)
-						.map(Qualifier::getClassid)
-						.orElse(null);
+				final String provenanceActionClassId = Optional
+					.ofNullable(subject.getDataInfo())
+					.map(DataInfo::getProvenanceaction)
+					.map(Qualifier::getClassid)
+					.orElse(null);

 				if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
-						!"subject:fos".equals(provenanceActionClassId)) {
+					!"subject:fos".equals(provenanceActionClassId)) {

 					Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
 					VocabularyTerm term = vocabulary.getTerm(subject.getValue());
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@ -398,6 +398,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 			o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
 			o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
 			o.setCountry(prepareQualifierSplitting(rs.getString("country")));
+			o.setOrganizationType(Organization.OrganizationType.valueOf(rs.getString("typology")));
 			o.setDataInfo(info);
 			o.setLastupdatetimestamp(lastUpdateTimestamp);

--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -156,6 +156,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -190,6 +191,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -224,6 +226,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -258,6 +261,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -292,6 +296,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -326,6 +331,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -360,6 +366,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -394,6 +401,7 @@
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -116,17 +116,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=10000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>8000</arg>
+            <arg>--numPartitions</arg><arg>10000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -143,17 +145,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=4000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>4000</arg>
+            <arg>--numPartitions</arg><arg>8000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -170,11 +174,13 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
@ -197,17 +203,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>300</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -224,17 +232,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=200
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>100</arg>
+            <arg>--numPartitions</arg><arg>200</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -251,17 +261,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/organization</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>400</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -278,17 +290,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/project</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>100</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
@ -305,17 +319,19 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/relation</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>10000</arg>
+            <arg>--numPartitions</arg><arg>15000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
@ -45,6 +45,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=15000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -79,6 +80,7 @@
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=10000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql
@ -28,7 +28,8 @@ SELECT
    (array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1]              AS ecinternationalorganization,
    (array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1]                      AS ecenterprise,
    (array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1]                    AS ecsmevalidated,
-    (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1]                       AS ecnutscode
+    (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1]                       AS ecnutscode,
+    org_types.name                                                                                              AS typology
 FROM organizations o
 	LEFT OUTER JOIN acronyms a    ON (a.id = o.id)
 	LEFT OUTER JOIN urls u        ON (u.id = o.id)
@ -37,6 +38,7 @@ FROM organizations o
 	LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different')
    LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
    LEFT OUTER JOIN other_ids idup  ON (od.id = idup.id)
+    LEFT OUTER JOIN org_types ON (org_types.val = o.type)
 WHERE
    o.status = 'approved' OR o.status = 'suggested'
 GROUP BY
@ -44,4 +46,5 @@ GROUP BY
 	o.name,
 	o.creation_date,
 	o.modification_date,
-	o.country;
+	o.country,
+	org_types.name;
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json
@ -0,0 +1,5 @@
+[
+  {"paramName":"mt",  "paramLongName":"master",     "paramDescription": "should be local or yarn",  "paramRequired": false},
+  {"paramName":"s",   "paramLongName":"sourcePath", "paramDescription": "the source Path",           "paramRequired": true},
+  {"paramName":"t",   "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json
@ -0,0 +1,166 @@
+{
+  "cites":{
+    "original":"Cites",
+    "inverse":"IsCitedBy"
+  },
+  "compiles":{
+    "original":"Compiles",
+    "inverse":"IsCompiledBy"
+  },
+  "continues":{
+    "original":"Continues",
+    "inverse":"IsContinuedBy"
+  },
+  "derives":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "describes":{
+    "original":"Describes",
+    "inverse":"IsDescribedBy"
+  },
+  "documents":{
+    "original":"Documents",
+    "inverse":"IsDocumentedBy"
+  },
+  "hasmetadata":{
+    "original":"HasMetadata",
+    "inverse":"IsMetadataOf"
+  },
+  "hasassociationwith":{
+    "original":"HasAssociationWith",
+    "inverse":"HasAssociationWith"
+  },
+  "haspart":{
+    "original":"HasPart",
+    "inverse":"IsPartOf"
+  },
+  "hasversion":{
+    "original":"HasVersion",
+    "inverse":"IsVersionOf"
+  },
+  "iscitedby":{
+    "original":"IsCitedBy",
+    "inverse":"Cites"
+  },
+  "iscompiledby":{
+    "original":"IsCompiledBy",
+    "inverse":"Compiles"
+  },
+  "iscontinuedby":{
+    "original":"IsContinuedBy",
+    "inverse":"Continues"
+  },
+  "isderivedfrom":{
+    "original":"IsDerivedFrom",
+    "inverse":"IsSourceOf"
+  },
+  "isdescribedby":{
+    "original":"IsDescribedBy",
+    "inverse":"Describes"
+  },
+  "isdocumentedby":{
+    "original":"IsDocumentedBy",
+    "inverse":"Documents"
+  },
+  "isidenticalto":{
+    "original":"IsIdenticalTo",
+    "inverse":"IsIdenticalTo"
+  },
+  "ismetadatafor":{
+    "original":"IsMetadataFor",
+    "inverse":"IsMetadataOf"
+  },
+  "ismetadataof":{
+    "original":"IsMetadataOf",
+    "inverse":"IsMetadataFor"
+  },
+  "isnewversionof":{
+    "original":"IsNewVersionOf",
+    "inverse":"IsPreviousVersionOf"
+  },
+  "isobsoletedby":{
+    "original":"IsObsoletedBy",
+    "inverse":"Obsoletes"
+  },
+  "isoriginalformof":{
+    "original":"IsOriginalFormOf",
+    "inverse":"IsVariantFormOf"
+  },
+  "ispartof":{
+    "original":"IsPartOf",
+    "inverse":"HasPart"
+  },
+  "ispreviousversionof":{
+    "original":"IsPreviousVersionOf",
+    "inverse":"IsNewVersionOf"
+  },
+  "isreferencedby":{
+    "original":"IsReferencedBy",
+    "inverse":"References"
+  },
+  "isrelatedto":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "isrequiredby":{
+    "original":"IsRequiredBy",
+    "inverse":"Requires"
+  },
+  "isreviewedby":{
+    "original":"IsReviewedBy",
+    "inverse":"Reviews"
+  },
+  "issourceof":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "issupplementedby":{
+    "original":"IsSupplementedBy",
+    "inverse":"IsSupplementTo"
+  },
+  "issupplementto":{
+    "original":"IsSupplementTo",
+    "inverse":"IsSupplementedBy"
+  },
+  "isvariantformof":{
+    "original":"IsVariantFormOf",
+    "inverse":"IsOriginalFormOf"
+  },
+  "isversionof":{
+    "original":"IsVersionOf",
+    "inverse":"HasVersion"
+  },
+  "obsoletes":{
+    "original":"Obsoletes",
+    "inverse":"IsObsoletedBy"
+  },
+  "references":{
+    "original":"References",
+    "inverse":"IsReferencedBy"
+  },
+  "requires":{
+    "original":"Requires",
+    "inverse":"IsRequiredBy"
+  },
+  "related":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "reviews":{
+    "original":"Reviews",
+    "inverse":"IsReviewedBy"
+  },
+  "unknown":{
+    "original":"Unknown",
+    "inverse":"Unknown"
+  },
+  "isamongtopnsimilardocuments": {
+    "original": "IsAmongTopNSimilarDocuments",
+    "inverse": "HasAmongTopNSimilarDocuments"
+  },
+  "hasamongtopnsimilardocuments": {
+    "original": "HasAmongTopNSimilarDocuments",
+    "inverse": "IsAmongTopNSimilarDocuments"
+  }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala
@ -25,27 +25,38 @@ object SparkApplyHostedByMapToResult {
          val i = p.getInstance().asScala
          if (i.size == 1) {
            val inst: Instance = i.head
-            inst.getHostedby.setKey(ei.getHostedById)
-            inst.getHostedby.setValue(ei.getName)
-            if (ei.getOpenAccess) {
-              inst.setAccessright(
-                OafMapperUtils.accessRight(
-                  ModelConstants.ACCESS_RIGHT_OPEN,
-                  "Open Access",
-                  ModelConstants.DNET_ACCESS_MODES,
-                  ModelConstants.DNET_ACCESS_MODES
-                )
-              )
-              inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
-              p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
-            }
+            patchInstance(p, ei, inst)

+          } else {
+            val cf = i.map(ii => ii.getCollectedfrom.getValue)
+            if (cf.contains("Crossref")) {
+              i.foreach(ii => {
+                patchInstance(p, ei, ii)
+              })
+            }
          }
        }
        p
      })(Encoders.bean(classOf[Publication]))
  }

+  /** Rewrites the hosting datasource of one instance with the datasource described by `ei`.
+    * When that datasource is flagged as open access, the instance is marked as gold
+    * Open Access and the best access right of the publication is recomputed.
+    *
+    * @param p    publication owning the instance; its best access right may be replaced
+    * @param ei   hosted-by entry providing datasource id, name and open-access flag
+    * @param inst instance patched in place
+    */
+  private def patchInstance(p: Publication, ei: EntityInfo, inst: Instance): Unit = {
+    val hostedBy = inst.getHostedby
+    hostedBy.setKey(ei.getHostedById)
+    hostedBy.setValue(ei.getName)
+
+    if (ei.getOpenAccess) {
+      val openAccess = OafMapperUtils.accessRight(
+        ModelConstants.ACCESS_RIGHT_OPEN,
+        "Open Access",
+        ModelConstants.DNET_ACCESS_MODES,
+        ModelConstants.DNET_ACCESS_MODES
+      )
+      inst.setAccessright(openAccess)
+      inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
+      p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()))
+    }
+  }
+
  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala
@ -0,0 +1,258 @@
+package eu.dnetlib.dhp.sx.graph
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
+import eu.dnetlib.dhp.schema.sx.scholix.{
+  Scholix,
+  ScholixCollectedFrom,
+  ScholixEntityId,
+  ScholixIdentifier,
+  ScholixRelationship,
+  ScholixResource
+}
+import org.json4s
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods.parse
+
+import scala.collection.JavaConverters._
+import scala.io.Source
+
+/** Flattened relation record consumed when building Scholix links.
+  *
+  * @param source        identifier of the source entity of the relation
+  * @param target        identifier of the target entity of the relation
+  * @param relclass      relation class name; looked up case-insensitively in the relation vocabulary
+  * @param id            identifier of the relation, copied into the generated Scholix record
+  * @param collectedfrom datasources that provided the relation; may be null or empty
+  */
+case class RelationInfo(
+  source: String,
+  target: String,
+  relclass: String,
+  id: String,
+  collectedfrom: Seq[RelKeyValue]
+) {}
+
+/** Key/value pair identifying a datasource: `key` is its identifier, `value` its name. */
+case class RelKeyValue(key: String, value: String) {}
+
+object ScholexplorerUtils {
+
+  val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
+  val mapper = new ObjectMapper()
+
+  case class RelationVocabulary(original: String, inverse: String) {}
+
+  /** Relation vocabulary loaded once from the classpath resource
+    * `/eu/dnetlib/dhp/sx/relation/relations.json`: maps a lower-case relation name to its
+    * canonical (`original`) spelling and to the name of its inverse relation.
+    */
+  val relations: Map[String, RelationVocabulary] = {
+    implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
+
+    // Decode explicitly as UTF-8 so the result does not depend on the platform default charset
+    val source = Source.fromInputStream(
+      getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json"),
+      "UTF-8"
+    )
+    // Closing the Source also closes the underlying resource stream
+    try parse(source.mkString).extract[Map[String, RelationVocabulary]]
+    finally source.close()
+  }
+
+  /** Returns the name of the inverse of the given relation.
+    *
+    * @param rel relation name, matched case-insensitively against the vocabulary; may be null
+    * @return the inverse relation name, or null when `rel` is null or not in the vocabulary
+    */
+  def invRel(rel: String): String =
+    // Option(rel) guards against null input (this method is used as a Spark UDF)
+    Option(rel)
+      .flatMap(name => relations.get(name.toLowerCase))
+      .map(_.inverse)
+      .orNull
+
+  /** Builds the OpenAIRE Explore URL of a datasource.
+    *
+    * @param id datasource identifier; its first three characters are stripped before
+    *           being appended to the URL
+    * @return the URL, or null when `id` is null or not longer than 12 characters
+    */
+  def generateDatasourceOpenAIREURLS(id: String): String =
+    Option(id)
+      .filter(_.length > 12)
+      .map(dsId => s"https://explore.openaire.eu/search/dataprovider?datasourceId=${dsId.substring(3)}")
+      .orNull
+
+  /** Pairs every PID with the first URL that contains the PID value (case-insensitive).
+    *
+    * @param pidValue PIDs to resolve
+    * @param urls     candidate URLs
+    * @return one tuple per PID, in input order; the URL is null when no candidate matches
+    *         or when the PID has no value
+    */
+  def findURLForPID(
+    pidValue: List[StructuredProperty],
+    urls: List[String]
+  ): List[(StructuredProperty, String)] = {
+    pidValue.map { p =>
+      // Lower-case the PID once instead of once per URL; a PID without value cannot match
+      val needle = Option(p.getValue).map(_.toLowerCase)
+      // Null URLs are skipped rather than dereferenced
+      val url = needle.flatMap(n => urls.find(u => u != null && u.toLowerCase.contains(n)))
+      (p, url.orNull)
+    }
+  }
+
+  /** Collects the distinct typed identifiers declared by the instances of a result.
+    * Only instances having at least one URL and a non-null PID list are considered; each
+    * PID is paired with the matching instance URL (see `findURLForPID`).
+    *
+    * @param r result whose instances are inspected
+    * @return the distinct identifiers, empty when the result has no instances
+    */
+  def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
+    val instances = r.getInstance()
+    if (instances == null || instances.isEmpty)
+      List()
+    else
+      instances.asScala
+        .filter(inst => inst.getUrl != null && !inst.getUrl.isEmpty && inst.getPid != null)
+        .flatMap(inst => findURLForPID(inst.getPid.asScala.toList, inst.getUrl.asScala.toList))
+        .map { case (pid, url) =>
+          new ScholixIdentifier(pid.getValue, pid.getQualifier.getClassid, url)
+        }
+        .distinct
+        .toList
+  }
+
+  /** Builds the Scholix entity describing a datasource, identified by its OpenAIRE id. */
+  private def datasourceEntityId(name: String, key: String): ScholixEntityId = {
+    val id = new ScholixIdentifier()
+    id.setIdentifier(key)
+    id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
+    id.setUrl(generateDatasourceOpenAIREURLS(key))
+    val entity = new ScholixEntityId()
+    entity.setName(name)
+    entity.setIdentifiers(List(id).asJava)
+    entity
+  }
+
+  /** Converts a graph result into a Scholix resource.
+    *
+    * @param result the result to convert
+    * @return the resource, or null when the result has no instances, no PIDs, or no
+    *         typed identifier can be extracted from its instances
+    */
+  def generateScholixResourceFromResult(result: Result): ScholixResource = {
+
+    if (result.getInstance() == null || result.getInstance().isEmpty)
+      return null
+
+    if (result.getPid == null || result.getPid.isEmpty)
+      return null
+
+    val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
+    if (persistentIdentifiers.isEmpty)
+      return null
+
+    val instances = result.getInstance().asScala
+
+    val r = new ScholixResource
+    r.setDnetIdentifier(result.getId)
+    r.setIdentifier(persistentIdentifiers.asJava)
+
+    if (result.getResulttype != null)
+      r.setObjectType(result.getResulttype.getClassid)
+
+    // headOption: when no instance declares an instance type the sub type is left unset
+    // instead of failing with NoSuchElementException
+    instances
+      .filter(i => i != null && i.getInstancetype != null)
+      .map(i => i.getInstancetype.getClassname)
+      .headOption
+      .foreach(subType => r.setObjectSubType(subType))
+
+    // The first title, when present, becomes the resource title
+    if (result.getTitle != null && !result.getTitle.isEmpty)
+      r.setTitle(result.getTitle.get(0).getValue)
+
+    if (result.getAuthor != null && !result.getAuthor.isEmpty) {
+      val authors: List[ScholixEntityId] =
+        result.getAuthor.asScala
+          .map(a => {
+            val entity = new ScholixEntityId()
+            entity.setName(a.getFullname)
+            if (a.getPid != null && a.getPid.size() > 0)
+              entity.setIdentifiers(
+                a.getPid.asScala
+                  .map(sp => {
+                    val id = new ScholixIdentifier()
+                    id.setIdentifier(sp.getValue)
+                    id.setSchema(sp.getQualifier.getClassid)
+                    id
+                  })
+                  .take(3) // at most three identifiers per author
+                  .toList
+                  .asJava
+              )
+            entity
+          })
+          .toList
+      if (authors.nonEmpty)
+        r.setCreator(authors.asJava)
+    }
+
+    // Publication date: acceptance date of the first instance declaring one
+    instances
+      .filter(i => i.getDateofacceptance != null)
+      .map(i => i.getDateofacceptance.getValue)
+      .headOption
+      .foreach(date => r.setPublicationDate(date))
+
+    // Publishers: the distinct hosting datasources, ignoring missing and "unknown" ones
+    r.setPublisher(
+      instances
+        .map(i => i.getHostedby)
+        .filter(h => h != null && !"unknown".equalsIgnoreCase(h.getValue))
+        .map(h => datasourceEntityId(h.getValue, h.getKey))
+        .distinct
+        .asJava
+    )
+
+    if (result.getCollectedfrom != null)
+      r.setCollectedFrom(
+        result.getCollectedfrom.asScala
+          .map(cf => {
+            val scf = new ScholixCollectedFrom()
+            scf.setProvisionMode("collected")
+            scf.setCompletionStatus("complete")
+            scf.setProvider(datasourceEntityId(cf.getValue, cf.getKey))
+            scf
+          })
+          .asJava
+      )
+
+    r
+  }
+
+  /** Creates the Scholix link for a relation whose source resource is already resolved.
+    * The target is a placeholder carrying only the target identifier; it is meant to be
+    * replaced later through `updateTarget`.
+    *
+    * @param relation the relation to convert
+    * @param source   the resolved source resource
+    * @return the Scholix link, or null when the relation class is null or is not part of
+    *         the relation vocabulary
+    */
+  def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
+    // Resolve the relation semantics first: nothing is built for relations that are discarded
+    val semanticRelation = Option(relation.relclass)
+      .flatMap(relClass => relations.get(relClass.toLowerCase))
+      .orNull
+    if (semanticRelation == null)
+      return null
+
+    val s: Scholix = new Scholix
+    s.setSource(source)
+    if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
+      s.setLinkprovider(
+        relation.collectedfrom
+          .map(cf => {
+            val eid = new ScholixEntityId()
+            eid.setName(cf.value)
+            val id = new ScholixIdentifier()
+            id.setIdentifier(cf.key)
+            id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
+            id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
+            eid.setIdentifiers(List(id).asJava)
+            eid
+          })
+          .toList
+          .asJava
+      )
+    else {
+      // No provenance available: OpenAIRE itself is reported as link provider
+      val eid = new ScholixEntityId()
+      eid.setName("OpenAIRE")
+      val id = new ScholixIdentifier()
+      id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
+      id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
+      id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
+      eid.setIdentifiers(List(id).asJava)
+      s.setLinkprovider(List(eid).asJava)
+    }
+    s.setIdentifier(relation.id)
+    s.setRelationship(
+      new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
+    )
+    s.setPublicationDate(source.getPublicationDate)
+    s.setPublisher(source.getPublisher)
+    val mockTarget = new ScholixResource
+    mockTarget.setDnetIdentifier(relation.target)
+    s.setTarget(mockTarget)
+    s
+  }
+
+  /** Sets the resolved target on a Scholix link, merges source and target publishers
+    * (distinct, at most 10) and serializes the link to JSON.
+    *
+    * @param s the Scholix link, modified in place
+    * @param t the resolved target resource
+    * @return the JSON serialization of the updated link
+    */
+  def updateTarget(s: Scholix, t: ScholixResource): String = {
+    def publishersOf(list: java.util.List[ScholixEntityId]): Seq[ScholixEntityId] =
+      if (list == null || list.isEmpty) List() else list.asScala
+
+    s.setTarget(t)
+    val sourcePublishers = publishersOf(s.getPublisher)
+    val targetPublishers = publishersOf(t.getPublisher)
+    val merged = (sourcePublishers ++ targetPublishers).distinct.take(10).toList
+    s.setPublisher(merged.asJava)
+    mapper.writeValueAsString(s)
+  }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
@ -0,0 +1,141 @@
+package eu.dnetlib.dhp.sx.graph
+
+import eu.dnetlib.dhp.application.AbstractScalaApplication
+import eu.dnetlib.dhp.schema.oaf.{
+  KeyValue,
+  OtherResearchProduct,
+  Publication,
+  Relation,
+  Result,
+  Software,
+  Dataset => OafDataset
+}
+import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
+import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
+    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+
+  /** Entry point executed by every Spark application:
+    * the whole logic of the Spark node is defined here.
+    * Reads the `sourcePath` and `targetPath` arguments, then generates in sequence the
+    * bidirectional relations, the Scholix resources and the Scholix links.
+    */
+  override def run(): Unit = {
+    val inputGraphPath = parser.get("sourcePath")
+    log.info("sourcePath: {}", inputGraphPath)
+    val dumpPath = parser.get("targetPath")
+    log.info("targetPath: {}", dumpPath)
+
+    generateBidirectionalRelations(inputGraphPath, dumpPath, spark)
+    generateScholixResource(inputGraphPath, dumpPath, spark)
+    generateScholix(dumpPath, spark)
+  }
+
+  /** Converts every result of the graph into a Scholix resource and stores the outcome
+    * under `outputPath/resource`, overwriting previous content.
+    *
+    * @param inputPath  base path of the graph; one JSON folder per result type is read
+    * @param outputPath base path of the dump
+    * @param spark      the Spark session
+    */
+  def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+    implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
+
+    // Folder name of each result type -> schema used to read it
+    val schemaByEntity: Map[String, StructType] = Map(
+      "publication"          -> Encoders.bean(classOf[Publication]).schema,
+      "dataset"              -> Encoders.bean(classOf[OafDataset]).schema,
+      "software"             -> Encoders.bean(classOf[Software]).schema,
+      "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
+    )
+
+    // Reads one result type and keeps only the results that yield a Scholix resource
+    def loadResources(entity: String, schema: StructType): Dataset[ScholixResource] =
+      spark.read
+        .schema(schema)
+        .json(s"$inputPath/$entity")
+        .as[Result]
+        .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
+        .filter(s => s != null)
+
+    val allResources = schemaByEntity.foldLeft(spark.emptyDataset[ScholixResource]) {
+      case (collected, (entity, schema)) =>
+        println(s"adding $entity")
+        collected.union(loadResources(entity, schema))
+    }
+    allResources.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
+  }
+
+  /** Produces, for every result-to-result relation of the graph, both the relation and
+    * its inverse, and stores them under `otuputPath/relation` (overwriting previous content).
+    * Relations deleted by inference and merge relations are ignored. Each relation gets an
+    * `id` computed as md5 of source + relClass + target; duplicates sharing an id are collapsed.
+    *
+    * @param inputPath  base path of the graph; relations are read from `inputPath/relation`
+    * @param otuputPath base path of the dump (parameter name kept as is for compatibility
+    *                   with callers using named arguments)
+    * @param spark      the Spark session
+    */
+  def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
+    val relSchema = Encoders.bean(classOf[Relation]).schema
+
+    val relDF = spark.read
+      .schema(relSchema)
+      .json(s"$inputPath/relation")
+      .where(
+        "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
+        "and relClass <> 'merges' and relClass <> 'isMergedIn'"
+      )
+      .select("source", "target", "collectedfrom", "relClass")
+
+    // Null-safe wrapper: a missing relClass has no inverse
+    def invRel: String => String = { s =>
+      if (s == null) null else ScholexplorerUtils.invRel(s)
+    }
+
+    import org.apache.spark.sql.functions.udf
+    val inverseRelationUDF = udf(invRel)
+    val inverseRelation = relDF
+      .select(
+        col("target").alias("source"),
+        col("source").alias("target"),
+        col("collectedfrom"),
+        inverseRelationUDF(col("relClass")).alias("relClass")
+      )
+      // Relation classes missing from the vocabulary have no inverse: without this filter
+      // concat/md5 would yield a null id and all such rows would collapse into one group
+      .where(col("relClass").isNotNull)
+
+    val bidRel = inverseRelation
+      .union(relDF)
+      .withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
+      .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
+      .drop("collectedfrom")
+      .withColumnRenamed("cf", "collectedfrom")
+      .groupBy(col("id"))
+      .agg(
+        first("source").alias("source"),
+        first("target").alias("target"),
+        first("relClass").alias("relClass"),
+        first("collectedfrom").alias("collectedfrom")
+      )
+
+    bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
+
+  }
+
+  /** Joins the relations stored under `outputPath/relation` with the resources stored
+    * under `outputPath/resource`, on both source and target, and writes the resulting
+    * Scholix links as gzip-compressed JSON lines under `outputPath/scholix`.
+    *
+    * @param outputPath base path of the dump, holding the `relation` and `resource` datasets
+    * @param spark      the Spark session
+    */
+  def generateScholix(outputPath: String, spark: SparkSession): Unit = {
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
+
+    import spark.implicits._
+    val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
+    val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
+
+    // Links with the source resolved; the target is still a placeholder
+    val scholix_one_verse = relations
+      .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
+      .map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
+      // generateScholix returns null for relation classes missing from the vocabulary
+      .filter(s => s != null)
+      .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
+
+    // Relation id -> resolved target resource
+    val resourceTarget = relations
+      .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
+      .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
+
+    scholix_one_verse
+      .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
+      .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
+      .write
+      .mode(SaveMode.Overwrite)
+      .option("compression", "gzip")
+      .text(s"$outputPath/scholix")
+  }
+}
+
+object SparkCreateScholexplorerDump {
+  val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
+
+  /** Command line entry point: builds the application with its argument definition
+    * resource, initializes it and runs it.
+    */
+  def main(args: Array[String]): Unit = {
+    val application = new SparkCreateScholexplorerDump(
+      "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json",
+      args,
+      logger
+    )
+    application.initialize().run()
+  }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
@ -0,0 +1,26 @@
+package eu.dnetlib.dhp.sx.graph.scholix
+
+import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
+import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
+import org.junit.jupiter.api.Test
+import org.objenesis.strategy.StdInstantiatorStrategy
+
+class ScholixGenerationTest {
+
+  /** Manual test running the Scholix generation against a sample stored on the local
+    * file system. The sample is not part of the repository, therefore the test is
+    * skipped (not failed) when the sample folder is missing.
+    */
+  @Test
+  def generateScholix(): Unit = {
+    val samplePath = "/home/sandro/Downloads/scholix/"
+    org.junit.jupiter.api.Assumptions.assumeTrue(
+      java.nio.file.Files.isDirectory(java.nio.file.Paths.get(samplePath, "relation")),
+      s"local Scholix sample not found under $samplePath"
+    )
+
+    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
+    val app = new SparkCreateScholexplorerDump(null, null, null)
+    // The sample is produced by the two steps below, to be enabled when it must be regenerated:
+//   app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
+//    app.generateBidirectionalRelations(
+//      "/home/sandro/Downloads/scholix_sample/",
+//      "/home/sandro/Downloads/scholix/",
+//      spark
+//    )
+    app.generateScholix(samplePath, spark)
+
+  }
+}
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@ -18,7 +18,7 @@
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
-                        <phase>initialize</phase>
+                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
@ -59,12 +59,6 @@
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.slf4j</groupId>
-                    <artifactId>slf4j-api</artifactId>
-                </exclusion>
-            </exclusions>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
@ -160,6 +154,26 @@
                    <groupId>org.apache.zookeeper</groupId>
                    <artifactId>zookeeper</artifactId>
                </exclusion>
+                <exclusion>
+                    <artifactId>ant</artifactId>
+                    <groupId>org.apache.ant</groupId>
+                </exclusion>
+                <exclusion>
+                    <artifactId>antlr4-runtime</artifactId>
+                    <groupId>org.antlr</groupId>
+                </exclusion>
+                <exclusion>
+                    <artifactId>woodstox-core</artifactId>
+                    <groupId>com.fasterxml.woodstox</groupId>
+                </exclusion>
+                <exclusion>
+                    <artifactId>log4j</artifactId>
+                    <groupId>*</groupId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.logging.log4j</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
@ -206,5 +220,90 @@

    </dependencies>

+    <!-- Each profile registers one additional source folder through
+         build-helper-maven-plugin (add-source goal, generate-sources phase). -->
+    <profiles>
+        <!-- Active by default: adds src/main/sparksolr-3 as source folder -->
+        <profile>
+            <id>spark-24</id>
+            <activation>
+                <activeByDefault>true</activeByDefault>
+            </activation>
+
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>build-helper-maven-plugin</artifactId>
+                        <version>3.4.0</version>
+                        <executions>
+                            <execution>
+                                <phase>generate-sources</phase>
+                                <goals>
+                                    <goal>add-source</goal>
+                                </goals>
+                                <configuration>
+                                    <sources>
+                                        <source>src/main/sparksolr-3</source>
+                                    </sources>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+
+        <!-- Adds src/main/sparksolr-4 as source folder -->
+        <profile>
+            <id>spark-34</id>
+
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>build-helper-maven-plugin</artifactId>
+                        <version>3.4.0</version>
+                        <executions>
+                            <execution>
+                                <phase>generate-sources</phase>
+                                <goals>
+                                    <goal>add-source</goal>
+                                </goals>
+                                <configuration>
+                                    <sources>
+                                        <source>src/main/sparksolr-4</source>
+                                    </sources>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+
+        <!-- Adds src/main/sparksolr-4 as source folder (same folder as the spark-34 profile) -->
+        <profile>
+            <id>spark-35</id>
+
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>build-helper-maven-plugin</artifactId>
+                        <version>3.4.0</version>
+                        <executions>
+                            <execution>
+                                <phase>generate-sources</phase>
+                                <goals>
+                                    <goal>add-source</goal>
+                                </goals>
+                                <configuration>
+                                    <sources>
+                                        <source>src/main/sparksolr-4</source>
+                                    </sources>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>

 </project>
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java
@ -31,7 +31,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.oa.provision.XmlConverterJob;
 import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
 import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;

@ -48,7 +47,7 @@ public class IrishOaiExporterJob {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					XmlConverterJob.class
+					IrishOaiExporterJob.class
 						.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json")));
 		parser.parseArgument(args);

--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
 					result
 						.getTitle()
 						.stream()
+						.filter(t -> StringUtils.isNotBlank(t.getValue()))
 						.findFirst()
-						.map(StructuredProperty::getValue)
 						.ifPresent(
-							title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
+							title -> {
+								re.setTitle(title);
+								re
+									.getTitle()
+									.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
+							});
 				}
 				if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
 					result
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
-import static org.apache.spark.sql.functions.*;

 import java.util.List;
 import java.util.Map;
 import java.util.Optional;

 import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.SparkContext;
-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.expressions.UserDefinedFunction;
-import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -45,9 +37,9 @@ import scala.Tuple2;
 /**
 * XmlConverterJob converts the JoinedEntities as XML records
 */
-public class XmlConverterJob {
+public class PayloadConverterJob {

-	private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
+	private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);

 	public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";

@ -56,8 +48,8 @@ public class XmlConverterJob {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					XmlConverterJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
+					PayloadConverterJob.class
+						.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
 		parser.parseArgument(args);

 		final Boolean isSparkSessionManaged = Optional
@ -72,6 +64,12 @@ public class XmlConverterJob {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);

+		final Boolean validateXML = Optional
+			.ofNullable(parser.get("validateXML"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.FALSE);
+		log.info("validateXML: {}", validateXML);
+
 		final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
 		log.info("contextApiBaseUrl: {}", contextApiBaseUrl);

@ -86,18 +84,19 @@ public class XmlConverterJob {

 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			removeOutputDir(spark, outputPath);
-			convertToXml(
+			createPayloads(
 				spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
-				VocabularyGroup.loadVocsFromIS(isLookup));
+				VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
 		});
 	}

-	private static void convertToXml(
+	private static void createPayloads(
 		final SparkSession spark,
 		final String inputPath,
 		final String outputPath,
 		final ContextMapper contextMapper,
-		final VocabularyGroup vocabularies) {
+		final VocabularyGroup vocabularies,
+		final Boolean validateXML) {

 		final XmlRecordFactory recordFactory = new XmlRecordFactory(
 			prepareAccumulators(spark.sparkContext()),
@ -118,7 +117,7 @@ public class XmlConverterJob {
 			.as(Encoders.kryo(JoinedEntity.class))
 			.map(
 				(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
-					recordFactory.build(je),
+					recordFactory.build(je, validateXML),
 					ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
 				Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
 			.map(
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
@ -2,42 +2,34 @@
 package eu.dnetlib.dhp.oa.provision;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static org.apache.spark.sql.functions.col;

 import java.util.HashSet;
 import java.util.Optional;
-import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;

 import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.FilterFunction;
-import org.apache.spark.api.java.function.FlatMapFunction;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.expressions.Aggregator;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+import org.apache.spark.sql.functions;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
-import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
-import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
-import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
 import eu.dnetlib.dhp.schema.oaf.Relation;
-import scala.Tuple2;

 /**
 * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
@ -130,132 +122,36 @@ public class PrepareRelationsJob {
 	private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
 		Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {

-		JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
-			.filter(rel -> !(rel.getSource().startsWith("unresolved") || rel.getTarget().startsWith("unresolved")))
-			.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
-			.filter(rel -> !relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())));
+		WindowSpec source_w = Window
+			.partitionBy("source", "subRelType")
+			.orderBy(col("target").desc_nulls_last());

-		JavaRDD<Relation> pruned = pruneRels(
-			pruneRels(
-				rels,
-				sourceMaxRelations, relPartitions, (Function<Relation, String>) Relation::getSource),
-			targetMaxRelations, relPartitions, (Function<Relation, String>) Relation::getTarget);
-		spark
-			.createDataset(pruned.rdd(), Encoders.bean(Relation.class))
-			.repartition(relPartitions)
-			.write()
-			.mode(SaveMode.Overwrite)
-			.parquet(outputPath);
-	}
+		WindowSpec target_w = Window
+			.partitionBy("target", "subRelType")
+			.orderBy(col("source").desc_nulls_last());

-	private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, int maxRelations,
-		int relPartitions, Function<Relation, String> idFn) {
-		return rels
-			.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r))
-			.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
-			.groupBy(Tuple2::_1)
-			.map(Tuple2::_2)
-			.map(t -> Iterables.limit(t, maxRelations))
-			.flatMap(Iterable::iterator)
-			.map(Tuple2::_2);
-	}
-
-	// experimental
-	private static void prepareRelationsDataset(
-		SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
-		int relPartitions) {
 		spark
 			.read()
-			.textFile(inputRelationsPath)
-			.repartition(relPartitions)
-			.map(
-				(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
-				Encoders.kryo(Relation.class))
-			.filter((FilterFunction<Relation>) rel -> !rel.getDataInfo().getDeletedbyinference())
-			.filter((FilterFunction<Relation>) rel -> !relationFilter.contains(rel.getRelClass()))
-			.groupByKey(
-				(MapFunction<Relation, String>) Relation::getSource,
-				Encoders.STRING())
-			.agg(new RelationAggregator(maxRelations).toColumn())
-			.flatMap(
-				(FlatMapFunction<Tuple2<String, RelationList>, Relation>) t -> Iterables
-					.limit(t._2().getRelations(), maxRelations)
-					.iterator(),
-				Encoders.bean(Relation.class))
-			.repartition(relPartitions)
+			.schema(Encoders.bean(Relation.class).schema())
+			.json(inputRelationsPath)
+			.where("source NOT LIKE 'unresolved%' AND  target  NOT LIKE 'unresolved%'")
+			.where("datainfo.deletedbyinference != true")
+			.where(
+				relationFilter.isEmpty() ? "true" // no-op predicate: an empty string is not a parsable expression
+					: "lower(relClass) NOT IN ("
+						+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
+			.withColumn("source_w_pos", functions.row_number().over(source_w))
+			.where("source_w_pos <= " + sourceMaxRelations) // row_number() is 1-based: keep up to the max per group
+			.drop("source_w_pos")
+			.withColumn("target_w_pos", functions.row_number().over(target_w))
+			.where("target_w_pos <= " + targetMaxRelations) // same 1-based cap, applied per (target, subRelType)
+			.drop("target_w_pos")
+			.coalesce(relPartitions)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.parquet(outputPath);
 	}

-	public static class RelationAggregator
-		extends Aggregator<Relation, RelationList, RelationList> {
-
-		private final int maxRelations;
-
-		public RelationAggregator(int maxRelations) {
-			this.maxRelations = maxRelations;
-		}
-
-		@Override
-		public RelationList zero() {
-			return new RelationList();
-		}
-
-		@Override
-		public RelationList reduce(RelationList b, Relation a) {
-			b.getRelations().add(a);
-			return getSortableRelationList(b);
-		}
-
-		@Override
-		public RelationList merge(RelationList b1, RelationList b2) {
-			b1.getRelations().addAll(b2.getRelations());
-			return getSortableRelationList(b1);
-		}
-
-		@Override
-		public RelationList finish(RelationList r) {
-			return getSortableRelationList(r);
-		}
-
-		private RelationList getSortableRelationList(RelationList b1) {
-			RelationList sr = new RelationList();
-			sr
-				.setRelations(
-					b1
-						.getRelations()
-						.stream()
-						.limit(maxRelations)
-						.collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator()))));
-			return sr;
-		}
-
-		@Override
-		public Encoder<RelationList> bufferEncoder() {
-			return Encoders.kryo(RelationList.class);
-		}
-
-		@Override
-		public Encoder<RelationList> outputEncoder() {
-			return Encoders.kryo(RelationList.class);
-		}
-	}
-
-	/**
-	 * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
-	 * file,
-	 *
-	 * @param spark
-	 * @param inputPath
-	 * @return the JavaRDD<SortableRelation> containing all the relationships
-	 */
-	private static JavaRDD<Relation> readPathRelationRDD(
-		SparkSession spark, final String inputPath) {
-		JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-		return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class));
-	}
-
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
@ -14,4 +14,7 @@ public class ProvisionConstants {
 		return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
 	}

+	public static final String PUBLIC_ALIAS_NAME = "public";
+	public static final String SHADOW_ALIAS_NAME = "shadow";
+
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java
@ -1,44 +0,0 @@
-
-package eu.dnetlib.dhp.oa.provision;
-
-import java.util.Comparator;
-import java.util.Map;
-import java.util.Optional;
-
-import com.google.common.collect.ComparisonChain;
-import com.google.common.collect.Maps;
-
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-
-public class RelationComparator implements Comparator<Relation> {
-
-	private static final Map<String, Integer> weights = Maps.newHashMap();
-
-	static {
-		weights.put(ModelConstants.OUTCOME, 0);
-		weights.put(ModelConstants.SUPPLEMENT, 1);
-		weights.put(ModelConstants.REVIEW, 2);
-		weights.put(ModelConstants.CITATION, 3);
-		weights.put(ModelConstants.AFFILIATION, 4);
-		weights.put(ModelConstants.RELATIONSHIP, 5);
-		weights.put(ModelConstants.PUBLICATION_DATASET, 6);
-		weights.put(ModelConstants.SIMILARITY, 7);
-
-		weights.put(ModelConstants.PROVISION, 8);
-		weights.put(ModelConstants.PARTICIPATION, 9);
-		weights.put(ModelConstants.DEDUP, 10);
-	}
-
-	private Integer getWeight(Relation o) {
-		return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
-	}
-
-	@Override
-	public int compare(Relation o1, Relation o2) {
-		return ComparisonChain
-			.start()
-			.compare(getWeight(o1), getWeight(o2))
-			.result();
-	}
-}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java
@ -1,25 +0,0 @@
-
-package eu.dnetlib.dhp.oa.provision;
-
-import java.io.Serializable;
-import java.util.PriorityQueue;
-import java.util.Queue;
-
-import eu.dnetlib.dhp.schema.oaf.Relation;
-
-public class RelationList implements Serializable {
-
-	private Queue<Relation> relations;
-
-	public RelationList() {
-		this.relations = new PriorityQueue<>(new RelationComparator());
-	}
-
-	public Queue<Relation> getRelations() {
-		return relations;
-	}
-
-	public void setRelations(Queue<Relation> relations) {
-		this.relations = relations;
-	}
-}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
@ -9,6 +9,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.solr.client.solrj.SolrResponse;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -23,7 +24,7 @@ public class SolrAdminApplication implements Closeable {
 	private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);

 	enum Action {
-		DELETE_BY_QUERY, COMMIT
+		DELETE_BY_QUERY, COMMIT, UPDATE_ALIASES
 	}

 	private final CloudSolrClient solrClient;
@ -39,9 +40,6 @@ public class SolrAdminApplication implements Closeable {
 		final String isLookupUrl = parser.get("isLookupUrl");
 		log.info("isLookupUrl: {}", isLookupUrl);

-		final String format = parser.get("format");
-		log.info("format: {}", format);
-
 		final Action action = Action.valueOf(parser.get("action"));
 		log.info("action: {}", action);

@ -59,11 +57,21 @@ public class SolrAdminApplication implements Closeable {
 		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);

-		final String collection = ProvisionConstants.getCollectionName(format);
-		log.info("collection: {}", collection);
+		final String publicFormat = parser.get("publicFormat");
+		log.info("publicFormat: {}", publicFormat);
+
+		final String shadowFormat = parser.get("shadowFormat");
+		log.info("shadowFormat: {}", shadowFormat);
+
+		// get collection names from metadata format profiles names
+		final String publicCollection = ProvisionConstants.getCollectionName(publicFormat);
+		log.info("publicCollection: {}", publicCollection);
+
+		final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
+		log.info("shadowCollection: {}", shadowCollection);

 		try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
-			app.execute(action, collection, query, commit);
+			app.execute(action, query, commit, publicCollection, shadowCollection);
 		}
 	}

@ -72,22 +80,29 @@ public class SolrAdminApplication implements Closeable {
 		this.solrClient = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot()).build();
 	}

-	public SolrResponse commit(String collection) throws IOException, SolrServerException {
-		return execute(Action.COMMIT, collection, null, true);
+	public SolrResponse commit(String shadowCollection) throws IOException, SolrServerException {
+		return execute(Action.COMMIT, null, true, null, shadowCollection);
 	}

-	public SolrResponse execute(Action action, String collection, String query, boolean commit)
+	public SolrResponse execute(Action action, String query, boolean commit,
+		String publicCollection, String shadowCollection)
 		throws IOException, SolrServerException {
 		switch (action) {

 			case DELETE_BY_QUERY:
-				UpdateResponse rsp = solrClient.deleteByQuery(collection, query);
+				UpdateResponse rsp = solrClient.deleteByQuery(shadowCollection, query);
 				if (commit) {
-					solrClient.commit(collection);
+					return solrClient.commit(shadowCollection);
 				}
 				return rsp;
+
 			case COMMIT:
-				return solrClient.commit(collection);
+				return solrClient.commit(shadowCollection);
+
+			case UPDATE_ALIASES:
+				this.updateAliases(publicCollection, shadowCollection);
+				return null;
+
 			default:
 				throw new IllegalArgumentException("action not managed: " + action);
 		}
@ -98,4 +113,28 @@ public class SolrAdminApplication implements Closeable {
 		solrClient.close();
 	}

+	/**
+	 * Re-points the public and the shadow alias to the given collections: both aliases are
+	 * deleted first and then created again.
+	 * NOTE(review): Solr's CREATEALIAS is documented to replace an existing alias; confirm whether
+	 * the preliminary delete step is actually required.
+	 *
+	 * @param publicCollection collection bound to {@code ProvisionConstants.PUBLIC_ALIAS_NAME}
+	 * @param shadowCollection collection bound to {@code ProvisionConstants.SHADOW_ALIAS_NAME}
+	 * @throws SolrServerException propagated from the alias requests
+	 * @throws IOException propagated from the alias requests
+	 */
+	private void updateAliases(String publicCollection, String shadowCollection)
+		throws SolrServerException, IOException {
+
+		// drop whatever the two aliases currently reference ...
+		deleteAlias(ProvisionConstants.PUBLIC_ALIAS_NAME);
+		deleteAlias(ProvisionConstants.SHADOW_ALIAS_NAME);
+
+		// ... then bind them to the requested collections
+		createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, publicCollection);
+		createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, shadowCollection);
+
+	}
+
+	/**
+	 * Builds a delete-alias collection admin request for the given alias and processes it
+	 * through this application's Solr client.
+	 *
+	 * @param aliasName name of the alias to delete
+	 * @return the response obtained by processing the request
+	 * @throws SolrServerException propagated from the request processing
+	 * @throws IOException propagated from the request processing
+	 */
+	public SolrResponse deleteAlias(String aliasName) throws SolrServerException, IOException {
+		return CollectionAdminRequest.deleteAlias(aliasName).process(solrClient);
+	}
+
+	/**
+	 * Builds a create-alias collection admin request binding the alias to the given collection
+	 * and processes it through this application's Solr client.
+	 *
+	 * @param aliasName name of the alias to create
+	 * @param collection collection the alias refers to
+	 * @return the response obtained by processing the request
+	 * @throws IOException propagated from the request processing
+	 * @throws SolrServerException propagated from the request processing
+	 */
+	public SolrResponse createAlias(String aliasName, String collection) throws IOException, SolrServerException {
+		return CollectionAdminRequest.createAlias(aliasName, collection).process(solrClient);
+	}
+
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrRecordDumpJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrRecordDumpJob.java
@ -36,7 +36,7 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {

 	private final String inputPath;

-	private final String format;
+	private final String shadowFormat;

 	private final String outputPath;

@ -61,8 +61,8 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);

-		final String format = parser.get("format");
-		log.info("format: {}", format);
+		final String shadowFormat = parser.get("shadowFormat");
+		log.info("shadowFormat: {}", shadowFormat);

 		final String outputPath = Optional
 			.ofNullable(parser.get("outputPath"))
@ -95,27 +95,24 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
 				final String isLookupUrl = parser.get("isLookupUrl");
 				log.info("isLookupUrl: {}", isLookupUrl);
 				final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
-				new SolrRecordDumpJob(spark, inputPath, format, outputPath).run(isLookup);
+				new SolrRecordDumpJob(spark, inputPath, shadowFormat, outputPath).run(isLookup);
 			});
 	}

-	public SolrRecordDumpJob(SparkSession spark, String inputPath, String format, String outputPath) {
+	public SolrRecordDumpJob(SparkSession spark, String inputPath, String shadowFormat, String outputPath) {
 		this.spark = spark;
 		this.inputPath = inputPath;
-		this.format = format;
+		this.shadowFormat = shadowFormat;
 		this.outputPath = outputPath;
 	}

 	public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
-		final String fields = isLookup.getLayoutSource(format);
+		final String fields = isLookup.getLayoutSource(shadowFormat);
 		log.info("fields: {}", fields);

 		final String xslt = isLookup.getLayoutTransformer();

-		final String dsId = isLookup.getDsId(format);
-		log.info("dsId: {}", dsId);
-
-		final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
+		final String indexRecordXslt = getLayoutTransformer(shadowFormat, fields, xslt);
 		log.info("indexRecordTransformer {}", indexRecordXslt);

 		final Encoder<TupleWrapper> encoder = Encoders.bean(TupleWrapper.class);
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java
@ -1,81 +0,0 @@
-
-package eu.dnetlib.dhp.oa.provision;
-
-import java.io.Serializable;
-import java.util.Map;
-import java.util.Optional;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.google.common.collect.ComparisonChain;
-import com.google.common.collect.Maps;
-
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-
-public class SortableRelation extends Relation implements Comparable<SortableRelation>, Serializable {
-
-	private static final Map<String, Integer> weights = Maps.newHashMap();
-
-	static {
-		weights.put(ModelConstants.OUTCOME, 0);
-		weights.put(ModelConstants.SUPPLEMENT, 1);
-		weights.put(ModelConstants.REVIEW, 2);
-		weights.put(ModelConstants.CITATION, 3);
-		weights.put(ModelConstants.AFFILIATION, 4);
-		weights.put(ModelConstants.RELATIONSHIP, 5);
-		weights.put(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, 6);
-		weights.put(ModelConstants.SIMILARITY, 7);
-
-		weights.put(ModelConstants.PROVISION, 8);
-		weights.put(ModelConstants.PARTICIPATION, 9);
-		weights.put(ModelConstants.DEDUP, 10);
-	}
-
-	private static final long serialVersionUID = 34753984579L;
-
-	private String groupingKey;
-
-	public static SortableRelation create(Relation r, String groupingKey) {
-		SortableRelation sr = new SortableRelation();
-		sr.setGroupingKey(groupingKey);
-		sr.setSource(r.getSource());
-		sr.setTarget(r.getTarget());
-		sr.setRelType(r.getRelType());
-		sr.setSubRelType(r.getSubRelType());
-		sr.setRelClass(r.getRelClass());
-		sr.setDataInfo(r.getDataInfo());
-		sr.setCollectedfrom(r.getCollectedfrom());
-		sr.setLastupdatetimestamp(r.getLastupdatetimestamp());
-		sr.setProperties(r.getProperties());
-		sr.setValidated(r.getValidated());
-		sr.setValidationDate(r.getValidationDate());
-
-		return sr;
-	}
-
-	@JsonIgnore
-	public Relation asRelation() {
-		return this;
-	}
-
-	@Override
-	public int compareTo(SortableRelation o) {
-		return ComparisonChain
-			.start()
-			.compare(getGroupingKey(), o.getGroupingKey())
-			.compare(getWeight(this), getWeight(o))
-			.result();
-	}
-
-	private Integer getWeight(SortableRelation o) {
-		return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
-	}
-
-	public String getGroupingKey() {
-		return groupingKey;
-	}
-
-	public void setGroupingKey(String groupingKey) {
-		this.groupingKey = groupingKey;
-	}
-}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
@ -25,6 +25,7 @@ import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
 import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
 import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
+import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -39,6 +40,8 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {

 	private final String format;

+	private final String shadowCollection;
+
 	private final int batchSize;

 	private final SparkSession spark;
@ -62,8 +65,11 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);

-		final String format = parser.get("format");
-		log.info("format: {}", format);
+		final String shadowFormat = parser.get("shadowFormat");
+		log.info("shadowFormat: {}", shadowFormat);
+
+		final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
+		log.info("shadowCollection: {}", shadowCollection);

 		final Integer batchSize = Optional
 			.ofNullable(parser.get("batchSize"))
@ -84,15 +90,17 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
 				final String isLookupUrl = parser.get("isLookupUrl");
 				log.info("isLookupUrl: {}", isLookupUrl);
 				final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
-				new XmlIndexingJob(spark, inputPath, format, batchSize)
+				new XmlIndexingJob(spark, inputPath, shadowFormat, shadowCollection, batchSize)
 					.run(isLookup);
 			});
 	}

-	public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize) {
+	public XmlIndexingJob(SparkSession spark, String inputPath, String format, String shadowCollection,
+		Integer batchSize) {
 		this.spark = spark;
 		this.inputPath = inputPath;
 		this.format = format;
+		this.shadowCollection = shadowCollection;
 		this.batchSize = batchSize;
 	}

@ -102,12 +110,6 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {

 		final String xslt = isLookup.getLayoutTransformer();

-		final String dsId = isLookup.getDsId(format);
-		log.info("dsId: {}", dsId);
-
-		final String collection = ProvisionConstants.getCollectionName(format);
-		log.info("collection: {}", collection);
-
 		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);

@ -129,7 +131,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
 			.javaRDD()
 			.map(
 				t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
-		SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
+		DHPSolrSupport.indexDocs(zkHost, shadowCollection, batchSize, docs.rdd());
 	}

 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
@ -1,8 +1,6 @@

 package eu.dnetlib.dhp.oa.provision.model;

-import static org.apache.commons.lang3.StringUtils.substringBefore;
-
 import java.io.StringReader;
 import java.util.*;
 import java.util.stream.Collectors;
@ -16,16 +14,15 @@ import org.jetbrains.annotations.Nullable;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;

 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
-import eu.dnetlib.dhp.oa.provision.RelationList;
-import eu.dnetlib.dhp.oa.provision.SortableRelation;
 import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
 import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.solr.*;
 import eu.dnetlib.dhp.schema.solr.AccessRight;
 import eu.dnetlib.dhp.schema.solr.Author;
@ -33,8 +30,10 @@ import eu.dnetlib.dhp.schema.solr.Context;
 import eu.dnetlib.dhp.schema.solr.Country;
 import eu.dnetlib.dhp.schema.solr.Datasource;
 import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines;
+import eu.dnetlib.dhp.schema.solr.ExternalReference;
 import eu.dnetlib.dhp.schema.solr.Instance;
 import eu.dnetlib.dhp.schema.solr.Journal;
+import eu.dnetlib.dhp.schema.solr.Measure;
 import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
 import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
 import eu.dnetlib.dhp.schema.solr.Organization;
@ -55,10 +54,7 @@ public class ProvisionModelSupport {
 					.newArrayList(
 						RelatedEntityWrapper.class,
 						JoinedEntity.class,
-						RelatedEntity.class,
-						SortableRelationKey.class,
-						SortableRelation.class,
-						RelationList.class));
+						RelatedEntity.class));
 		return modelClasses.toArray(new Class[] {});
 	}

@ -74,10 +70,15 @@ public class ProvisionModelSupport {
 			.setHeader(
 				SolrRecordHeader
 					.newInstance(
-						e.getId(), e.getOriginalId(), type, deletedbyinference));
+						StringUtils
+							.substringAfter(
+								e.getId(),
+								IdentifierFactory.ID_PREFIX_SEPARATOR),
+						e.getOriginalId(), type, deletedbyinference));
 		r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
 		r.setContext(asContext(e.getContext(), contextMapper));
 		r.setPid(asPid(e.getPid()));
+		r.setMeasures(mapMeasures(e.getMeasures()));

 		if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) {
 			r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
@ -108,13 +109,24 @@ public class ProvisionModelSupport {
 		final RelatedEntity re = rew.getTarget();
 		final RecordType relatedRecordType = RecordType.valueOf(re.getType());
 		final Relation relation = rew.getRelation();
+		final String relationProvenance = Optional
+			.ofNullable(relation.getDataInfo())
+			.map(
+				d -> Optional
+					.ofNullable(d.getProvenanceaction())
+					.map(Qualifier::getClassid)
+					.orElse(null))
+			.orElse(null);
 		rr
 			.setHeader(
 				RelatedRecordHeader
 					.newInstance(
 						relation.getRelType(),
 						relation.getRelClass(),
-						relation.getTarget(), relatedRecordType));
+						StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR),
+						relatedRecordType,
+						relationProvenance,
+						Optional.ofNullable(relation.getDataInfo()).map(DataInfo::getTrust).orElse(null)));

 		rr.setAcronym(re.getAcronym());
 		rr.setCode(re.getCode());
@ -132,11 +144,20 @@ public class ProvisionModelSupport {
 		rr.setOfficialname(re.getOfficialname());
 		rr.setOpenairecompatibility(mapCodeLabel(re.getOpenairecompatibility()));
 		rr.setPid(asPid(re.getPid()));
-		rr.setProjectTitle(rr.getProjectTitle());
+		rr.setWebsiteurl(re.getWebsiteurl());
+		rr.setProjectTitle(re.getProjectTitle());
 		rr.setPublisher(re.getPublisher());
 		rr.setResulttype(mapQualifier(re.getResulttype()));
 		rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null));

+		if (relation.getValidated() == null) {
+			relation.setValidated(false);
+		}
+		if (ModelConstants.OUTCOME.equals(relation.getSubRelType())
+			&& StringUtils.isNotBlank(relation.getValidationDate())) {
+			rr.setValidationDate(relation.getValidationDate());
+		}
+
 		return rr;
 	}

@ -147,6 +168,7 @@ public class ProvisionModelSupport {
 		ps.setContracttype(mapCodeLabel(p.getContracttype()));
 		ps.setCurrency(mapField(p.getCurrency()));
 		ps.setDuration(mapField(p.getDuration()));
+		ps.setOamandatepublications(mapField(p.getOamandatepublications()));
 		ps.setCallidentifier(mapField(p.getCallidentifier()));
 		ps.setEcarticle29_3(mapField(p.getEcarticle29_3()));
 		ps.setEnddate(mapField(p.getEnddate()));
@ -266,6 +288,7 @@ public class ProvisionModelSupport {
 		ds.setOfficialname(mapField(d.getOfficialname()));
 		ds.setDescription(mapField(d.getDescription()));
 		ds.setJournal(mapJournal(d.getJournal()));
+		ds.setWebsiteurl(mapField(d.getWebsiteurl()));
 		ds.setLogourl(mapField(d.getLogourl()));
 		ds.setAccessinfopackage(mapFieldList(d.getAccessinfopackage()));
 		ds.setCertificates(mapField(d.getCertificates()));
@ -311,6 +334,7 @@ public class ProvisionModelSupport {
 		ds.setSubjects(asSubjectSP(d.getSubjects()));
 		ds.setSubmissionpolicyurl(d.getSubmissionpolicyurl());
 		ds.setThematic(d.getThematic());
+		ds.setContentpolicies(mapCodeLabel(d.getContentpolicies()));
 		ds.setVersioncontrol(d.getVersioncontrol());
 		ds.setVersioning(mapField(d.getVersioning()));

@ -326,6 +350,7 @@ public class ProvisionModelSupport {
 		rs.setOtherTitles(getOtherTitles(r.getTitle()));
 		rs.setDescription(mapFieldList(r.getDescription()));
 		rs.setSubject(asSubject(r.getSubject()));
+		rs.setLanguage(asLanguage(r.getLanguage()));
 		rs.setPublicationdate(mapField(r.getDateofacceptance()));
 		rs.setPublisher(mapField(r.getPublisher()));
 		rs.setEmbargoenddate(mapField(r.getEmbargoenddate()));
@ -341,17 +366,17 @@ public class ProvisionModelSupport {
 		rs.setCountry(asCountry(r.getCountry()));
 		rs.setEoscifguidelines(asEOSCIF(r.getEoscifguidelines()));

-		rs.setGreen(r.getIsGreen());
+		rs.setIsGreen(r.getIsGreen());
 		rs
 			.setOpenAccessColor(
 				Optional
 					.ofNullable(r.getOpenAccessColor())
 					.map(color -> OpenAccessColor.valueOf(color.toString()))
 					.orElse(null));
-		rs.setInDiamondJournal(r.getIsInDiamondJournal());
+		rs.setIsInDiamondJournal(r.getIsInDiamondJournal());
 		rs.setPubliclyFunded(r.getPubliclyFunded());
 		rs.setTransformativeAgreement(r.getTransformativeAgreement());
-
+		rs.setExternalReference(mapExternalReference(r.getExternalReference()));
 		rs.setInstance(mapInstances(r.getInstance()));

 		if (r instanceof Publication) {
@ -375,6 +400,13 @@ public class ProvisionModelSupport {
 		return rs;
 	}

+	/**
+	 * Converts a language qualifier into the Solr {@code Language} model, using the qualifier's
+	 * class id and class name.
+	 *
+	 * @param lang the language qualifier, may be {@code null}
+	 * @return the mapped language, or {@code null} when no qualifier is given
+	 */
+	private static Language asLanguage(Qualifier lang) {
+		if (lang == null) {
+			return null;
+		}
+		return Language.newInstance(lang.getClassid(), lang.getClassname());
+	}
+
 	@Nullable
 	private static List<String> getOtherTitles(List<StructuredProperty> titleList) {
 		return Optional
@ -387,7 +419,7 @@ public class ProvisionModelSupport {
 							.equals(
 								Optional
 									.ofNullable(t.getQualifier())
-									.map(Qualifier::getClassid)
+									.map(Qualifier::getClassname)
 									.orElse(null)))
 					.map(StructuredProperty::getValue)
 					.collect(Collectors.toList()))
@ -405,7 +437,7 @@ public class ProvisionModelSupport {
 							.equals(
 								Optional
 									.ofNullable(t.getQualifier())
-									.map(Qualifier::getClassid)
+									.map(Qualifier::getClassname)
 									.orElse(null)))
 					.map(StructuredProperty::getValue)
 					.findFirst())
@ -422,7 +454,7 @@ public class ProvisionModelSupport {
 						Instance i = new Instance();
 						i.setCollectedfrom(asProvenance(instance.getCollectedfrom()));
 						i.setHostedby(asProvenance(instance.getHostedby()));
-						i.setFulltext(i.getFulltext());
+						i.setFulltext(instance.getFulltext());
 						i.setPid(asPid(instance.getPid()));
 						i.setAlternateIdentifier(asPid(instance.getAlternateIdentifier()));
 						i.setAccessright(mapAccessRight(instance.getAccessright()));
@ -453,7 +485,8 @@ public class ProvisionModelSupport {
 	private static AccessRight mapAccessRight(eu.dnetlib.dhp.schema.oaf.AccessRight accessright) {
 		return AccessRight
 			.newInstance(
-				mapQualifier(accessright),
+				accessright.getClassid(),
+				accessright.getClassname(),
 				Optional
 					.ofNullable(accessright.getOpenAccessRoute())
 					.map(route -> OpenAccessRoute.valueOf(route.toString()))
@ -472,7 +505,7 @@ public class ProvisionModelSupport {
 	}

 	private static String mapQualifier(eu.dnetlib.dhp.schema.oaf.Qualifier q) {
-		return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null);
+		return Optional.ofNullable(q).map(Qualifier::getClassname).orElse(null);
 	}

 	private static Journal mapJournal(eu.dnetlib.dhp.schema.oaf.Journal joaf) {
@ -508,7 +541,46 @@ public class ProvisionModelSupport {
 	}

 	private static Provenance asProvenance(KeyValue keyValue) {
-		return Optional.ofNullable(keyValue).map(cf -> Provenance.newInstance(cf.getKey(), cf.getValue())).orElse(null);
+		return Optional
+			.ofNullable(keyValue)
+			.map(
+				kv -> Provenance
+					.newInstance(
+						StringUtils.substringAfter(kv.getKey(), IdentifierFactory.ID_PREFIX_SEPARATOR),
+						kv.getValue()))
+			.orElse(null);
+	}
+
+	/**
+	 * Maps the graph measures onto their Solr counterparts, keeping the measure id and
+	 * converting its units via {@code mapCodeLabelKV}.
+	 *
+	 * @param measures the measures to convert, may be {@code null}
+	 * @return the converted measures, or {@code null} when the input list is {@code null}
+	 */
+	private static List<Measure> mapMeasures(List<eu.dnetlib.dhp.schema.oaf.Measure> measures) {
+		if (measures == null) {
+			return null;
+		}
+		return measures
+			.stream()
+			.map(m -> Measure.newInstance(m.getId(), mapCodeLabelKV(m.getUnit())))
+			.collect(Collectors.toList());
+	}
+
+	/**
+	 * Maps the graph external references onto their Solr counterparts, copying site name, label,
+	 * alternate labels, url, qualifier (as code/label), reference identifier and query.
+	 *
+	 * @param externalReference the references to convert, may be {@code null}
+	 * @return the converted references; an empty list when the input is {@code null}
+	 */
+	private static List<ExternalReference> mapExternalReference(
+		List<eu.dnetlib.dhp.schema.oaf.ExternalReference> externalReference) {
+		if (externalReference == null) {
+			return Lists.newArrayList();
+		}
+		final List<ExternalReference> mapped = externalReference
+			.stream()
+			.map(
+				e -> ExternalReference
+					.newInstance(
+						e.getSitename(),
+						e.getLabel(),
+						e.getAlternateLabel(),
+						e.getUrl(),
+						mapCodeLabel(e.getQualifier()),
+						e.getRefidentifier(),
+						e.getQuery()))
+			.collect(Collectors.toList());
+		return mapped;
 	}

 	private static List<Context> asContext(List<eu.dnetlib.dhp.schema.oaf.Context> ctxList,
@ -529,7 +601,7 @@ public class ProvisionModelSupport {
 		}

 		return Optional
-			.ofNullable(contexts)
+			.of(contexts)
 			.map(
 				ctx -> ctx
 					.stream()
@ -581,7 +653,14 @@ public class ProvisionModelSupport {
 			.map(
 				pids -> pids
 					.stream()
-					.map(p -> Pid.newInstance(p.getQualifier().getClassid(), p.getValue()))
+					.filter(p -> Objects.nonNull(p.getQualifier()))
+					.filter(p -> Objects.nonNull(p.getQualifier().getClassid()))
+					.map(
+						p -> Pid
+							.newInstance(
+								p.getValue(),
+								p.getQualifier().getClassid(),
+								p.getQualifier().getClassname()))
 					.collect(Collectors.toList()))
 			.orElse(null);
 	}
@ -606,8 +685,10 @@ public class ProvisionModelSupport {
 				subjects -> subjects
 					.stream()
 					.filter(s -> Objects.nonNull(s.getQualifier()))
-					.filter(s -> Objects.nonNull(s.getQualifier().getClassid()))
-					.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid()))
+					.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
+					.map(
+						s -> Subject
+							.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
 					.collect(Collectors.toList()))
 			.orElse(null);
 	}
@ -619,8 +700,10 @@ public class ProvisionModelSupport {
 				subjects -> subjects
 					.stream()
 					.filter(s -> Objects.nonNull(s.getQualifier()))
-					.filter(s -> Objects.nonNull(s.getQualifier().getClassid()))
-					.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid()))
+					.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
+					.map(
+						s -> Subject
+							.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
 					.collect(Collectors.toList()))
 			.orElse(null);
 	}
@ -689,7 +772,7 @@ public class ProvisionModelSupport {
 	private static CodeLabel mapCodeLabel(KeyValue kv) {
 		return Optional
 			.ofNullable(kv)
-			.map(q -> CodeLabel.newInstance(kv.getKey(), kv.getValue()))
+			.map(k -> CodeLabel.newInstance(k.getKey(), k.getValue()))
 			.orElse(null);
 	}

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Miriam Baglioni	a2b708bb71	[AffiliationIngestion]refactoring	2024-06-29 18:36:47 +02:00
Miriam Baglioni	9cbe966b4a	[AffiliationIngestion]refactoring	2024-06-29 18:35:49 +02:00
Miriam Baglioni	236b64d830	[AffiliationIngestion]Extended the ingestion of affiliation from open aire to include also links derived from Web Crawl. Extended the test. Inserted in Constatns the id and name of the webcrawl datasource to be used here and also in the ingestion of links from web crawl	2024-06-29 18:29:20 +02:00
Claudio Atzori	14539f9c8b	[graph provision] publicFormat worfklow parameter defined as optional	2024-06-28 14:55:18 +02:00
Claudio Atzori	1bc8c5d173	[graph provision] fixed serialization of the instancetypes	2024-06-28 14:54:28 +02:00
Claudio Atzori	1ccf01cdb8	Using the updated Solr JSON payload model classes	2024-06-28 12:38:07 +02:00
Claudio Atzori	b79cb155ba	Merge pull request 'Fix permissions-issue in Stats-workflow, step22a-createPDFsAggregated.' (#450 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#450	2024-06-26 10:11:34 +02:00
Claudio Atzori	33a02c5b9e	Merge pull request 'Change the selection criteria for the pivot record of a group so that by best pid type becomes the first criteria. This will have the effect to converge to records having DOI pid' (#446 ) from pivotselectionbypid into beta Reviewed-on: D-Net/dnet-hadoop#446	2024-06-26 10:10:13 +02:00
Claudio Atzori	1182bca9eb	Merge pull request 'Add support to cretate/update solr collection aliases' (#449 ) from 9872-create-solr-collection-aliases into beta Reviewed-on: D-Net/dnet-hadoop#449	2024-06-26 10:09:51 +02:00
Claudio Atzori	1c30eacac2	updated index feeding procedure to exploit the collection aliases	2024-06-25 15:27:38 +02:00
Claudio Atzori	6055212f77	merged from the json_payload branch	2024-06-25 12:39:02 +02:00
Claudio Atzori	0031cf849e	Merge branch 'beta' into 9872-create-solr-collection-aliases	2024-06-25 09:58:01 +02:00
Serafeim Chatzopoulos	9f6e16a03c	Add support to cretate/update solr collection aliases	2024-06-20 16:03:15 +03:00
Lampros Smyrnaios	66cd28f70a	- Fix not using the "export HADOOP_USER_NAME" statement in "createPDFsAggregated.sh", which caused permission-issues when creating tables with Impala. - Remove unused "--user" parameter in "impala-shell" calls. - Code polishing.	2024-06-20 14:33:46 +03:00
Lampros Smyrnaios	c6b1ab2a18	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-06-20 14:33:05 +03:00
Miriam Baglioni	d35edac212	[IrishFunderList]make changed according to 9635 comment 20, 21, 22 and 23	2024-06-20 12:28:28 +02:00
Miriam Baglioni	6421f8fece	Merge remote-tracking branch 'origin/beta' into beta	2024-06-19 11:12:15 +02:00
Miriam Baglioni	ac270f795b	[IrishFunderList]make changed according to 9635 comment 14, 15 and 16	2024-06-19 11:11:52 +02:00
Lampros Smyrnaios	236aed8954	Merge remote-tracking branch 'origin/beta' into beta	2024-06-18 17:12:35 +03:00
Claudio Atzori	dd541f8cf5	Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster.' (#447 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#447	2024-06-18 15:52:30 +02:00
Lampros Smyrnaios	ff335578ea	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-06-18 14:52:31 +03:00
Lampros Smyrnaios	285416c74e	Merge branch 'beta' into beta	2024-06-18 13:50:38 +02:00
Lampros Smyrnaios	3095047e5e	Miscellaneous updates to the copying operation to Impala Cluster: - Fix not breaking out of the VIEWS-infinite-loop when the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" is set to "false". - Exit the script when no HDFS-active-node was found, independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR". - Fix view_name-recognition in a log-message, by using the more advanced "Perl-Compatible Regular Expressions" in "grep". - Add error-handling for "compute stats" errors.	2024-06-18 14:40:41 +03:00
Antonis Lempesis	0456f1b788	Merge remote-tracking branch 'origin/beta' into beta	2024-06-14 15:11:30 +03:00
Antonis Lempesis	38636942c7	filtering out deletedbyinference and invinsible results from accessroute	2024-06-14 15:11:19 +03:00
Lampros Smyrnaios	d942a1101b	Miscellaneous updates to the copying operation to Impala Cluster: - Show some counts and the elapsed time for various sub-tasks. - Code polishing.	2024-06-14 12:14:38 +03:00
Giambattista Bloisi	9bf2bda1c6	Fix: next returned a null value at end of stream	2024-06-12 13:28:51 +02:00
Giambattista Bloisi	d90cb099b8	Fix for paginationStart parameter management	2024-06-11 20:23:44 +02:00
Giambattista Bloisi	4f2a61e10f	Change the selection criteria for the pivot record of a group so that by best pid type becomes the first criteria. This will have the effect to slowly converge to records having DOI pid	2024-06-11 15:33:56 +02:00
Claudio Atzori	11fe3a4fe0	[graph resolution] use sparkExecutorMemory to define also the memoryOverhead	2024-06-11 14:21:17 +02:00
Claudio Atzori	a8d68c9d29	avoid NPEs	2024-06-11 14:19:24 +02:00
Miriam Baglioni	8fe934810f	Merge remote-tracking branch 'origin/beta' into beta	2024-06-11 10:28:51 +02:00
Miriam Baglioni	9da006e98c	[SDGFoSActionSet]remove datainfo for the result. It is not needed (qualifier.classid = UPDATE) useless since subject do not go at the level of the instance	2024-06-11 10:28:32 +02:00
Giambattista Bloisi	85c1eae7e0	Fixes for pagination strategy looping at end of download	2024-06-10 19:03:58 +02:00
Claudio Atzori	b0eba210c0	[actionset promotion] use sparkExecutorMemory to define also the memoryOverhead	2024-06-10 16:15:24 +02:00
Claudio Atzori	3776327a8c	hostedby patching to work with the updated Crossref contents, resolved conflict	2024-06-10 15:24:12 +02:00
Claudio Atzori	0139f23d66	Merge pull request 'organization type from OpenOrgs' (#445 ) from import_openorg_type into beta Reviewed-on: D-Net/dnet-hadoop#445	2024-06-07 12:17:31 +02:00
Michele Artini	c726572418	changed some parameters in OSF test	2024-06-07 12:03:26 +02:00
Claudio Atzori	ec79405cc9	[graph raw] set organization type from openorgs	2024-06-07 11:30:31 +02:00
Miriam Baglioni	1477406ecc	[bulkTag] fixed issue that made project disappear in graph_10_enriched	2024-06-06 10:45:41 +02:00
Claudio Atzori	92c3abd5a4	[graph cleaning] use sparkExecutorMemory to define also the memoryOverhead	2024-06-06 10:44:33 +02:00
Claudio Atzori	ce2364743a	applying changes from PR#442: Fix for missing collectedfrom after dedup	2024-06-06 10:43:43 +02:00
Claudio Atzori	f70dc76b61	minor	2024-06-06 10:43:10 +02:00
Claudio Atzori	73bd1938a5	[graph2hive] use sparkExecutorMemory to define also the memoryOverhead	2024-06-05 12:17:35 +02:00
Claudio Atzori	da5c1e73a4	Merge pull request 'Irish oaipmh exporter' (#443 ) from irish-oaipmh-exporter into beta Reviewed-on: D-Net/dnet-hadoop#443	2024-06-05 10:55:09 +02:00
Claudio Atzori	a02f3f0d2b	code formatting	2024-05-30 10:21:18 +02:00
Alessia Bardi	eadfd8d71d	Merge pull request 'Updated XMLIterator for splitting on different nodes' (#436 ) from dblp_collection_plugin into beta Reviewed-on: D-Net/dnet-hadoop#436	2024-05-29 16:05:06 +02:00
Alessia Bardi	05ee783c07	Merge branch 'beta' into dblp_collection_plugin	2024-05-29 16:04:39 +02:00
Alessia Bardi	fe9fb59c90	Merge pull request 'Rest collector plugin on hadoop supports a new param to pass request headers' (#441 ) from rest-collector-request-header-map into beta Reviewed-on: D-Net/dnet-hadoop#441	2024-05-29 15:54:39 +02:00
Claudio Atzori	c272c4ad68	code formatting	2024-05-29 15:50:07 +02:00
Alessia Bardi	c5f4da16a4	Merge branch 'beta' into rest-collector-request-header-map	2024-05-29 15:46:23 +02:00
Alessia	1b165a14a0	Rest collector plugin on hadoop supports a new param to pass request headers	2024-05-29 15:41:36 +02:00
Michele Artini	e996787be2	OSF test	2024-05-29 15:05:17 +02:00
Claudio Atzori	62716141c5	Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster' (#440 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#440	2024-05-29 14:34:51 +02:00
Miriam Baglioni	5d85b70e1f	[NOAMI] removed Ireland funder id 501100011103. ticket 9635	2024-05-29 11:55:00 +02:00
Lampros Smyrnaios	e3f28338c1	Miscellaneous updates to the copying operation to Impala Cluster: - Assign the WRITE and EXECUTE permissions to the DBs' HDFS-directories, in order to be able to create tables on top of them, in the Impala Cluster. - Make sure the "copydb" function returns early, when it encounters a fatal error, while respecting the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config.	2024-05-28 17:51:45 +03:00
Giambattista Bloisi	73316d8c83	Add jaxb and jaxws dependencies when compiling with spark-34 profile as they are required to run with jdk > 8	2024-05-28 14:14:51 +02:00
Miriam Baglioni	75d5ddb999	Update to include a blackList that filters out the results we know are wrongly associated to IE - update workflow definition - the blacklist parameter	2024-05-27 12:01:28 +02:00
Miriam Baglioni	87c9c61b41	Update to include a blackList that filters out the results we know are wrongly associated to IE - refactoring	2024-05-27 12:01:16 +02:00
Miriam Baglioni	b55fed09f8	Update to include a blackList that filters out the results we know are wrongly associated to IE	2024-05-27 12:01:01 +02:00
Claudio Atzori	107d958b89	[org dedup] avoid NPEs in SparkPrepareNewOrgs	2024-05-27 11:59:54 +02:00
Claudio Atzori	3a7a6ecc32	[org dedup] avoid NPEs in SparkPrepareOrgRels	2024-05-27 11:59:45 +02:00
Claudio Atzori	1af4224d3d	[org dedup] avoid NPEs in SparkPrepareOrgRels	2024-05-27 11:59:33 +02:00
Claudio Atzori	0d5bdb2db0	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-05-27 11:59:02 +02:00
Claudio Atzori	66548e6a83	Merge pull request 'changes in copy script' (#438 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#438	2024-05-27 11:54:03 +02:00
Antonis Lempesis	15b54a345a	added fos lvl4	2024-05-24 13:21:28 +03:00
Lampros Smyrnaios	b48ed6e617	Change configuration in the copy-operation to Impala Cluster: Set the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" parameter to "false".	2024-05-23 16:58:12 +03:00
Lampros Smyrnaios	68322843e2	Small updates to the copy-operation to Impala Cluster: - Add a configuration-"switch" to control whether the script exits upon an error or not. - Allow the script to exit when a table could not be created. - Show the elapsed time for processing each database.	2024-05-23 15:07:49 +03:00
Lampros Smyrnaios	c7b32bbacc	Update CopyDataToImpalaCluster: Update the code of acquiring the entities from Ocean cluster, through hive, in order to optimize the process and account for additional reserved keywords in Impala. Co-authored-by: Antonis Lempesis <antleb@di.uoa.gr>	2024-05-23 13:00:19 +03:00
Giambattista Bloisi	1b2357e10a	Merge pull request 'Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12' (#327 ) from spark34-integration into beta Reviewed-on: D-Net/dnet-hadoop#327	2024-05-23 09:20:28 +02:00
Sandro La Bruzzo	f1fe363b19	merged again from beta (I hope for the last time)	2024-05-22 11:08:52 +02:00
Sandro La Bruzzo	66c1ffc866	merged again from beta (I hope for the last time)	2024-05-22 11:02:46 +02:00
Claudio Atzori	1ea67eba82	Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta	2024-05-21 13:48:48 +02:00
Claudio Atzori	f9fb2fef6e	Merge pull request 'Modification of Microsoft Academic Graph Mapping' (#435 ) from mag_only_doi into beta Reviewed-on: D-Net/dnet-hadoop#435	2024-05-21 13:48:42 +02:00
Claudio Atzori	834461ba26	[graph provision]fixed wf definition, revised serialization of the usage counts measures	2024-05-21 13:48:06 +02:00
Sandro La Bruzzo	e8a61d5dd5	removed plugin, use only FileGZip plugin	2024-05-21 13:45:29 +02:00
Sandro La Bruzzo	ca9414b737	Implement multiple node name splitter on GZipCollectorPlugin and all nodes that use XMLIterator. If the splitter name contains is a comma separated values it splits for all the values	2024-05-21 09:11:13 +02:00
Sandro La Bruzzo	032bcc8279	since last beta workflow we decide to introduce in the graph only MAG item with DOI and set them invisible ( this should be the same behaviour of the previous DOIBoost mapping). This commit apply this type of mapping	2024-05-20 09:24:15 +02:00
Sandro La Bruzzo	103e2652b3	merged beta	2024-05-17 14:43:07 +02:00
Sandro La Bruzzo	a87f9ea643	fixed scholexplorer bug	2024-05-17 14:16:43 +02:00
Sandro La Bruzzo	6efab4d88e	fixed scholexplorer bug	2024-05-16 16:19:18 +02:00
Claudio Atzori	92f018d196	[graph provision] fixed path pointing to an intermediate data store in the working directory	2024-05-15 15:39:18 +02:00
Claudio Atzori	0611c81a2f	[graph provision] using Qualifier.classNames to populate the correponsing fields in the JSON payload	2024-05-15 15:33:10 +02:00
Claudio Atzori	1efe7f7e39	[graph provision] upgrade to dhp-schema:6.1.2, included project.oamandatepublications in the JSON payload mapping, fixed serialisation of the usageCounts measures	2024-05-14 12:39:31 +02:00
Claudio Atzori	53e7bb4336	Merge pull request 'rest-collector-plugin-with-retry' (#432 ) from rest-collector-plugin-with-retry into beta Reviewed-on: D-Net/dnet-hadoop#432	2024-05-10 09:02:33 +02:00
Claudio Atzori	f7d56e2ef2	Merge branch 'beta' into rest-collector-plugin-with-retry	2024-05-10 09:02:21 +02:00
Claudio Atzori	c1237ab39e	Merge pull request 'Fixes in Graph Provision' (#434 ) from beta_provision_relation into beta Reviewed-on: D-Net/dnet-hadoop#434	2024-05-09 14:15:05 +02:00
Claudio Atzori	dc3a5858f7	Merge branch 'beta' into beta_provision_relation	2024-05-09 14:14:43 +02:00
Claudio Atzori	55f39f7850	[graph provision] adds the possibility to validate the XML records before storing them via the validateXML parameter	2024-05-09 14:06:04 +02:00
Claudio Atzori	39a2afe8b5	[graph provision] fixed XML serialization of the usage counts measures, renamed workflow actions to better reflect their role	2024-05-09 13:54:42 +02:00
Claudio Atzori	908ed9da7a	Merge pull request 'Various fixes in the stats wf' (#430 ) from antonis.lempesis/dnet-hadoop:beta into beta Reviewed-on: D-Net/dnet-hadoop#430	2024-05-08 13:41:02 +02:00
Antonis Lempesis	0cada3cc8f	every step is run in the analytics queue. Hardcoded for now, will make a parameter later	2024-05-08 13:42:53 +03:00
Antonis Lempesis	90a4fb3547	fixed typos	2024-05-08 13:17:58 +03:00
Claudio Atzori	18aa323ee9	cleanup unused classes, adjustments in the oozie wf definition	2024-05-08 11:36:46 +02:00
Claudio Atzori	b4e3389432	fixed property mapping creating the RelatedEntity transient objects. spark cores & memory adjustments. Code formatting	2024-05-07 16:25:17 +02:00
Giambattista Bloisi	711048ceed	PrepareRelationsJob rewritten to use Spark Dataframe API and Windowing functions	2024-05-07 15:44:33 +02:00
Sandro La Bruzzo	db358ad0d2	code formatted	2024-05-02 15:25:57 +02:00
Sandro La Bruzzo	26bf8e763a	merged from beta	2024-05-02 15:20:23 +02:00
Sandro La Bruzzo	a860c57bbc	updated .gitignore	2024-05-02 15:16:00 +02:00
Sandro La Bruzzo	0646d0d064	Updated main sparkApplication to avoid to require master variable	2024-05-02 15:15:03 +02:00
Michele Artini	f4068de298	code reindent + tests	2024-05-02 09:51:33 +02:00
Michele Artini	2615136efc	added a retry mechanism	2024-04-30 11:58:42 +02:00
Sandro La Bruzzo	133ead1e3e	updated new version of scholexplorer Generation	2024-04-29 09:00:30 +02:00
Sandro La Bruzzo	052c6aac9d	formatted code	2024-04-26 16:03:04 +02:00
Sandro La Bruzzo	9cd3bc0f10	Added a new generation of the dump for scholexplorer tested with last version of spark, and strongly refactored	2024-04-26 16:02:07 +02:00
Sandro La Bruzzo	0d628cd62b	merged again from beta	2024-04-23 17:34:55 +02:00
Lampros Smyrnaios	49af2e5740	Miscellaneous updates to the copying operation to Impala Cluster: - Update the algorithm for creating views that depend on other views; overcome some bash-instabilities. - Upon any error, fail the whole process, not just the current DB-creation, as those errors usually indicate a bug in the initial DB-creation, that should be fixed immediately. - Enhance parallel-copy of large files by "hadoop distcp" command. - Reduce the "invalidate metadata" commands to just the current DB's tables, in order to eliminate the general overhead on Impala. - Show the number of tables and views in the logs. - Fix some log-messages.	2024-04-23 17:15:04 +03:00
Antonis Lempesis	d2649a1429	increased the jvm ram	2024-04-23 16:03:16 +03:00
Sandro La Bruzzo	073f320c6a	Added module containing all the dependencies, useful for spark deploy on k8.	2024-04-22 11:32:31 +02:00
Sandro La Bruzzo	b84ad0c06e	merged beta	2024-04-19 14:39:59 +02:00
Antonis Lempesis	b52a5a753b	Merge remote-tracking branch 'upstream/beta' into beta	2024-04-19 15:28:28 +03:00
Sandro La Bruzzo	8dd9cf84e2	code formatted	2024-04-19 12:30:59 +02:00
Sandro La Bruzzo	342cb6189b	fixed problem on changed signature on RowEncoder removed property dhp.schema.artifact	2024-04-19 12:13:26 +02:00
Antonis Lempesis	c3fe9662b2	all indicator tables are now stored as parquet	2024-04-19 12:45:36 +03:00
Antonis Lempesis	0c71c58df6	fixed the definition of gold_oa	2024-04-18 12:01:27 +03:00
Antonis Lempesis	43d05dbebb	fixed the definition of result_country	2024-04-18 11:53:50 +03:00
Antonis Lempesis	e728a0897c	fixed the definition of indi_pub_bronze_oa	2024-04-18 11:07:55 +03:00
Antonis Lempesis	308ae580a9	slight optimization in indi_pub_gold_oa definition	2024-04-18 10:57:52 +03:00
Antonis Lempesis	27d22bd8f9	slight optimization in indi_pub_gold_oa definition	2024-04-17 23:59:52 +03:00
Antonis Lempesis	1f5aba12fa	slight optimization in indi_pub_gold_oa definition	2024-04-17 23:54:23 +03:00
Giambattista Bloisi	613ec5ffce	Add profiles for different spark versions: spark-24, spark-34, spark-35	2023-12-05 19:11:06 +01:00
Sandro La Bruzzo	52495f2cd2	used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader	2023-12-05 19:11:06 +01:00
Sandro La Bruzzo	8c3e9a09d3	added repository openaire-third-parties	2023-12-05 19:11:06 +01:00
Giambattista Bloisi	2fa78f6071	Changes requires to build and run tests with Java 17	2023-12-05 19:11:06 +01:00
Giambattista Bloisi	326c9dc08c	Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12	2023-12-05 19:11:06 +01:00