forked from D-Net/dnet-hadoop
Compare commits
110 Commits
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | c465835061 | |
Miriam Baglioni | ddd20e7f8e | |
Miriam Baglioni | 67ff783e65 | |
Miriam Baglioni | d35edac212 | |
Miriam Baglioni | 6421f8fece | |
Miriam Baglioni | ac270f795b | |
Claudio Atzori | dd541f8cf5 | |
Lampros Smyrnaios | 285416c74e | |
Lampros Smyrnaios | 3095047e5e | |
Antonis Lempesis | 0456f1b788 | |
Antonis Lempesis | 38636942c7 | |
Lampros Smyrnaios | d942a1101b | |
Giambattista Bloisi | 9bf2bda1c6 | |
Giambattista Bloisi | d90cb099b8 | |
Claudio Atzori | 11fe3a4fe0 | |
Claudio Atzori | a8d68c9d29 | |
Miriam Baglioni | 8fe934810f | |
Miriam Baglioni | 9da006e98c | |
Giambattista Bloisi | 85c1eae7e0 | |
Claudio Atzori | b0eba210c0 | |
Claudio Atzori | 3776327a8c | |
Claudio Atzori | 0139f23d66 | |
Michele Artini | c726572418 | |
Claudio Atzori | ec79405cc9 | |
Miriam Baglioni | 1477406ecc | |
Claudio Atzori | 92c3abd5a4 | |
Claudio Atzori | ce2364743a | |
Claudio Atzori | f70dc76b61 | |
Claudio Atzori | 73bd1938a5 | |
Claudio Atzori | da5c1e73a4 | |
Claudio Atzori | a02f3f0d2b | |
Alessia Bardi | eadfd8d71d | |
Alessia Bardi | 05ee783c07 | |
Alessia Bardi | fe9fb59c90 | |
Claudio Atzori | c272c4ad68 | |
Alessia Bardi | c5f4da16a4 | |
Alessia | 1b165a14a0 | |
Michele Artini | e996787be2 | |
Claudio Atzori | 62716141c5 | |
Miriam Baglioni | 5d85b70e1f | |
Lampros Smyrnaios | e3f28338c1 | |
Giambattista Bloisi | 73316d8c83 | |
Miriam Baglioni | 75d5ddb999 | |
Miriam Baglioni | 87c9c61b41 | |
Miriam Baglioni | b55fed09f8 | |
Claudio Atzori | 107d958b89 | |
Claudio Atzori | 3a7a6ecc32 | |
Claudio Atzori | 1af4224d3d | |
Claudio Atzori | 0d5bdb2db0 | |
Claudio Atzori | 66548e6a83 | |
Antonis Lempesis | 15b54a345a | |
Lampros Smyrnaios | b48ed6e617 | |
Lampros Smyrnaios | 68322843e2 | |
Lampros Smyrnaios | c7b32bbacc | |
Giambattista Bloisi | 1b2357e10a | |
Sandro La Bruzzo | f1fe363b19 | |
Sandro La Bruzzo | 66c1ffc866 | |
Claudio Atzori | 1ea67eba82 | |
Claudio Atzori | f9fb2fef6e | |
Claudio Atzori | 834461ba26 | |
Sandro La Bruzzo | e8a61d5dd5 | |
Sandro La Bruzzo | ca9414b737 | |
Sandro La Bruzzo | 032bcc8279 | |
Sandro La Bruzzo | 103e2652b3 | |
Sandro La Bruzzo | a87f9ea643 | |
Sandro La Bruzzo | 6efab4d88e | |
Claudio Atzori | 92f018d196 | |
Claudio Atzori | 0611c81a2f | |
Claudio Atzori | 1efe7f7e39 | |
Claudio Atzori | 53e7bb4336 | |
Claudio Atzori | f7d56e2ef2 | |
Claudio Atzori | c1237ab39e | |
Claudio Atzori | dc3a5858f7 | |
Claudio Atzori | 55f39f7850 | |
Claudio Atzori | 39a2afe8b5 | |
Claudio Atzori | 908ed9da7a | |
Antonis Lempesis | 0cada3cc8f | |
Antonis Lempesis | 90a4fb3547 | |
Claudio Atzori | 18aa323ee9 | |
Claudio Atzori | b4e3389432 | |
Giambattista Bloisi | 711048ceed | |
Sandro La Bruzzo | db358ad0d2 | |
Sandro La Bruzzo | 26bf8e763a | |
Sandro La Bruzzo | a860c57bbc | |
Sandro La Bruzzo | 0646d0d064 | |
Michele Artini | f4068de298 | |
Michele Artini | 2615136efc | |
Sandro La Bruzzo | 133ead1e3e | |
Sandro La Bruzzo | 052c6aac9d | |
Sandro La Bruzzo | 9cd3bc0f10 | |
Sandro La Bruzzo | 0d628cd62b | |
Lampros Smyrnaios | 49af2e5740 | |
Antonis Lempesis | d2649a1429 | |
Sandro La Bruzzo | 073f320c6a | |
Sandro La Bruzzo | b84ad0c06e | |
Antonis Lempesis | b52a5a753b | |
Sandro La Bruzzo | 8dd9cf84e2 | |
Sandro La Bruzzo | 342cb6189b | |
Antonis Lempesis | c3fe9662b2 | |
Antonis Lempesis | 0c71c58df6 | |
Antonis Lempesis | 43d05dbebb | |
Antonis Lempesis | e728a0897c | |
Antonis Lempesis | 308ae580a9 | |
Antonis Lempesis | 27d22bd8f9 | |
Antonis Lempesis | 1f5aba12fa | |
Giambattista Bloisi | 613ec5ffce | |
Sandro La Bruzzo | 52495f2cd2 | |
Sandro La Bruzzo | 8c3e9a09d3 | |
Giambattista Bloisi | 2fa78f6071 | |
Giambattista Bloisi | 326c9dc08c |
|
@ -27,3 +27,4 @@ spark-warehouse
|
|||
/**/.factorypath
|
||||
/**/.scalafmt.conf
|
||||
/.java-version
|
||||
/dhp-shade-package/dependency-reduced-pom.xml
|
||||
|
|
|
@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
|
|||
mojo.outputFile = testFolder;
|
||||
|
||||
// execute
|
||||
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
|
||||
try {
|
||||
mojo.execute();
|
||||
Assertions.assertTrue(false); // not reached
|
||||
} catch (Exception e) {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
|
||||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -70,10 +70,7 @@
|
|||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.github.sisyphsu</groupId>
|
||||
<artifactId>dateparser</artifactId>
|
||||
|
@ -163,7 +160,7 @@
|
|||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>${dhp-schemas.artifact}</artifactId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -172,4 +169,23 @@
|
|||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<!-- dependencies required on JDK9+ because J2EE has been removed -->
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>spark-34</id>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>javax.xml.bind</groupId>
|
||||
<artifactId>jaxb-api</artifactId>
|
||||
<version>2.2.11</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.sun.xml.ws</groupId>
|
||||
<artifactId>jaxws-ri</artifactId>
|
||||
<version>2.3.3</version>
|
||||
<type>pom</type>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
|
|
@ -38,7 +38,7 @@ public class PacePerson {
|
|||
PacePerson.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/common/name_particles.txt")));
|
||||
} catch (IOException e) {
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
|
|||
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
|
||||
* concept_rec_id = 656930
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
* @throws MissingConceptDoiException
|
||||
*/
|
||||
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
|
||||
setDepositionId(concept_rec_id, 1);
|
||||
|
|
|
@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.commons.lang3.time.DateUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.joda.time.Instant;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class MergeEntitiesComparator implements Comparator<Oaf> {
|
||||
static final List<String> PID_AUTHORITIES = Arrays
|
||||
.asList(
|
||||
ModelConstants.ARXIV_ID,
|
||||
ModelConstants.PUBMED_CENTRAL_ID,
|
||||
ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
|
||||
ModelConstants.DATACITE_ID,
|
||||
ModelConstants.CROSSREF_ID);
|
||||
|
||||
static final List<String> RESULT_TYPES = Arrays
|
||||
.asList(
|
||||
ModelConstants.ORP_RESULTTYPE_CLASSID,
|
||||
ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
|
||||
ModelConstants.DATASET_RESULTTYPE_CLASSID,
|
||||
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
|
||||
public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
|
||||
|
||||
@Override
|
||||
public int compare(Oaf left, Oaf right) {
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return -1;
|
||||
if (right == null)
|
||||
return 1;
|
||||
|
||||
int res = 0;
|
||||
|
||||
// pid authority
|
||||
int cfp1 = Optional
|
||||
.ofNullable(left.getCollectedfrom())
|
||||
.map(
|
||||
cf -> cf
|
||||
.stream()
|
||||
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
|
||||
.max(Integer::compare)
|
||||
.orElse(-1))
|
||||
.orElse(-1);
|
||||
int cfp2 = Optional
|
||||
.ofNullable(right.getCollectedfrom())
|
||||
.map(
|
||||
cf -> cf
|
||||
.stream()
|
||||
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
|
||||
.max(Integer::compare)
|
||||
.orElse(-1))
|
||||
.orElse(-1);
|
||||
|
||||
if (cfp1 >= 0 && cfp1 > cfp2) {
|
||||
return 1;
|
||||
} else if (cfp2 >= 0 && cfp2 > cfp1) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// trust
|
||||
if (left.getDataInfo() != null && right.getDataInfo() != null) {
|
||||
res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
|
||||
}
|
||||
|
||||
// result type
|
||||
if (res == 0) {
|
||||
if (left instanceof Result && right instanceof Result) {
|
||||
Result r1 = (Result) left;
|
||||
Result r2 = (Result) right;
|
||||
|
||||
if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
|
||||
if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
|
||||
return -1;
|
||||
}
|
||||
} else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
|
||||
int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());
|
||||
|
||||
if (rt1 >= 0 && rt1 > rt2) {
|
||||
return 1;
|
||||
} else if (rt2 >= 0 && rt2 > rt1) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// id
|
||||
if (res == 0) {
|
||||
if (left instanceof OafEntity && right instanceof OafEntity) {
|
||||
res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -40,27 +40,12 @@ public class MergeUtils {
|
|||
|
||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
|
||||
boolean checkDelegateAuthority) {
|
||||
TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
|
||||
int res = 0;
|
||||
|
||||
if (o1.getDataInfo() != null && o2.getDataInfo() != null) {
|
||||
res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust());
|
||||
}
|
||||
ArrayList<T> sortedEntities = new ArrayList<>();
|
||||
oafEntityIterator.forEachRemaining(sortedEntities::add);
|
||||
sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());
|
||||
|
||||
if (res == 0) {
|
||||
if (o1 instanceof Result && o2 instanceof Result) {
|
||||
return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2);
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
});
|
||||
|
||||
while (oafEntityIterator.hasNext()) {
|
||||
sortedEntities.add(oafEntityIterator.next());
|
||||
}
|
||||
|
||||
Iterator<T> it = sortedEntities.descendingIterator();
|
||||
Iterator<T> it = sortedEntities.iterator();
|
||||
T merged = it.next();
|
||||
|
||||
while (it.hasNext()) {
|
||||
|
@ -143,7 +128,7 @@ public class MergeUtils {
|
|||
* https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
|
||||
* such version.
|
||||
* <p>
|
||||
* Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
|
||||
* Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator}
|
||||
* and proceeds with the canonical property merging.
|
||||
*
|
||||
* @param left
|
||||
|
@ -161,8 +146,9 @@ public class MergeUtils {
|
|||
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
|
||||
return right;
|
||||
}
|
||||
|
||||
// TODO: raise trust to have preferred fields from one or the other??
|
||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
||||
if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) {
|
||||
return mergeResultFields(left, right);
|
||||
} else {
|
||||
return mergeResultFields(right, left);
|
||||
|
@ -225,9 +211,9 @@ public class MergeUtils {
|
|||
|
||||
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
|
||||
Function<T, K> keyExtractor, BinaryOperator<T> merger) {
|
||||
if (left == null) {
|
||||
return right;
|
||||
} else if (right == null) {
|
||||
if (left == null || left.isEmpty()) {
|
||||
return right != null ? right : new ArrayList<>();
|
||||
} else if (right == null || right.isEmpty()) {
|
||||
return left;
|
||||
}
|
||||
|
||||
|
@ -405,7 +391,7 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
// should be an instance attribute, get the first non-null value
|
||||
merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage()));
|
||||
merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage()));
|
||||
|
||||
// distinct countries, do not manage datainfo
|
||||
merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
|
||||
|
@ -575,6 +561,13 @@ public class MergeUtils {
|
|||
return m != null ? m : e;
|
||||
}
|
||||
|
||||
private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
|
||||
if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) {
|
||||
return e;
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
|
||||
List<List<Author>> authors = new ArrayList<>();
|
||||
if (author != null) {
|
||||
|
@ -587,6 +580,10 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
private static String instanceKeyExtractor(Instance i) {
|
||||
// three levels of concatenating:
|
||||
// 1. ::
|
||||
// 2. @@
|
||||
// 3. ||
|
||||
return String
|
||||
.join(
|
||||
"::",
|
||||
|
@ -594,10 +591,10 @@ public class MergeUtils {
|
|||
kvKeyExtractor(i.getCollectedfrom()),
|
||||
qualifierKeyExtractor(i.getAccessright()),
|
||||
qualifierKeyExtractor(i.getInstancetype()),
|
||||
Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
|
||||
Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
|
||||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
|
||||
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
|
||||
.orElse(null));
|
||||
}
|
||||
|
||||
|
@ -706,7 +703,7 @@ public class MergeUtils {
|
|||
private static String spKeyExtractor(StructuredProperty sp) {
|
||||
return Optional
|
||||
.ofNullable(sp)
|
||||
.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
|
||||
.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,87 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class ResultTypeComparator implements Comparator<Result> {
|
||||
|
||||
public static final ResultTypeComparator INSTANCE = new ResultTypeComparator();
|
||||
|
||||
@Override
|
||||
public int compare(Result left, Result right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
HashSet<String> lCf = getCollectedFromIds(left);
|
||||
HashSet<String> rCf = getCollectedFromIds(right);
|
||||
|
||||
if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
|
||||
return -1;
|
||||
}
|
||||
if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
|
||||
if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
} else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
String lClass = left.getResulttype().getClassid();
|
||||
String rClass = right.getResulttype().getClassid();
|
||||
|
||||
if (!lClass.equals(rClass)) {
|
||||
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
protected HashSet<String> getCollectedFromIds(Result left) {
|
||||
return Optional
|
||||
.ofNullable(left.getCollectedfrom())
|
||||
.map(
|
||||
cf -> cf
|
||||
.stream()
|
||||
.map(KeyValue::getKey)
|
||||
.collect(Collectors.toCollection(HashSet::new)))
|
||||
.orElse(new HashSet<>());
|
||||
}
|
||||
}
|
|
@ -154,5 +154,13 @@
|
|||
"unknown":{
|
||||
"original":"Unknown",
|
||||
"inverse":"Unknown"
|
||||
},
|
||||
"isamongtopnsimilardocuments": {
|
||||
"original": "IsAmongTopNSimilarDocuments",
|
||||
"inverse": "HasAmongTopNSimilarDocuments"
|
||||
},
|
||||
"hasamongtopnsimilardocuments": {
|
||||
"original": "HasAmongTopNSimilarDocuments",
|
||||
"inverse": "IsAmongTopNSimilarDocuments"
|
||||
}
|
||||
}
|
|
@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
|
|||
val conf: SparkConf = new SparkConf()
|
||||
val master = parser.get("master")
|
||||
log.info(s"Creating Spark session: Master: $master")
|
||||
SparkSession
|
||||
val b = SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
if (master != null)
|
||||
b.master(master)
|
||||
b.getOrCreate()
|
||||
}
|
||||
|
||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||
|
|
|
@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
|
|||
}
|
||||
|
||||
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
||||
val sum = ScholixUtils.resultToSummary(r)
|
||||
if (sum != null)
|
||||
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
||||
else
|
||||
null
|
||||
}
|
||||
|
||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
||||
|
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
|
|||
|
||||
}
|
||||
|
||||
def invRel(rel: String): String = {
|
||||
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
|
||||
if (semanticRelation != null)
|
||||
semanticRelation.inverse
|
||||
else
|
||||
null
|
||||
}
|
||||
|
||||
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
||||
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
||||
|
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
|
|||
if (persistentIdentifiers.isEmpty)
|
||||
return null
|
||||
s.setLocalIdentifier(persistentIdentifiers.asJava)
|
||||
if (r.isInstanceOf[Publication])
|
||||
s.setTypology(Typology.publication)
|
||||
else
|
||||
s.setTypology(Typology.dataset)
|
||||
// s.setTypology(r.getResulttype.getClassid)
|
||||
|
||||
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
|
@ -59,14 +59,6 @@
|
|||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.gson</groupId>
|
||||
<artifactId>gson</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
|
@ -91,10 +83,6 @@
|
|||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-math3</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
|
@ -113,4 +101,90 @@
|
|||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>spark-24</id>
|
||||
<activation>
|
||||
<activeByDefault>true</activeByDefault>
|
||||
</activation>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-2</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>spark-34</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-2</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>spark-35</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-35</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,12 +1,6 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
@ -15,6 +9,13 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
|
|||
import com.jayway.jsonpath.{Configuration, JsonPath}
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions
|
||||
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil
|
||||
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
||||
|
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
|
|||
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
|
||||
|
||||
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
|
||||
df.map(r => rowFromJson(r))(RowEncoder(schema))
|
||||
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
|
||||
}
|
||||
|
||||
def rowFromJson(json: String): Row = {
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.pace.util
|
||||
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
object SparkCompatUtils {
|
||||
|
||||
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
|
||||
RowEncoder(schema)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.pace.util
|
||||
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
object SparkCompatUtils {
|
||||
|
||||
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
|
||||
ExpressionEncoder(schema)
|
||||
}
|
||||
}
|
|
@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
|
|||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class UtilTest {
|
||||
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-shade-package</artifactId>
|
||||
<description>This module create a jar of all module dependencies</description>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<transformers>
|
||||
<transformer>
|
||||
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
|
||||
</transformer>
|
||||
<transformer />
|
||||
<transformer>
|
||||
<resource>META-INF/cxf/bus-extensions.txt</resource>
|
||||
</transformer>
|
||||
</transformers>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:*</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/maven/**</exclude>
|
||||
<exclude>META-INF/*.SF</exclude>
|
||||
<exclude>META-INF/*.DSA</exclude>
|
||||
<exclude>META-INF/*.RSA</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
<relocations>
|
||||
<relocation>
|
||||
<pattern>com</pattern>
|
||||
<shadedPattern>repackaged.com.google.common</shadedPattern>
|
||||
<includes>
|
||||
<include>com.google.common.**</include>
|
||||
</includes>
|
||||
</relocation>
|
||||
</relocations>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.28</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<version>5.6.1</version>
|
||||
<scope>test</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-params</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-core</artifactId>
|
||||
<version>3.3.3</version>
|
||||
<scope>test</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>byte-buddy</artifactId>
|
||||
<groupId>net.bytebuddy</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>byte-buddy-agent</artifactId>
|
||||
<groupId>net.bytebuddy</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-junit-jupiter</artifactId>
|
||||
<version>3.3.3</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
</project>
|
|
@ -0,0 +1,169 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-shade-package</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<description>This module create a jar of all module dependencies</description>
|
||||
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-actionmanager</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-aggregation</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-blacklist</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-broker-events</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-enrichment</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-mapper</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-provision</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-impact-indicators</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-actionsets</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-hist-snaps</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-monitor-irish</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-promote</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-swh</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-raw-data-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-stats-build</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<transformers>
|
||||
<transformer
|
||||
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
||||
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
|
||||
</transformer>
|
||||
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
|
||||
<transformer
|
||||
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
|
||||
<transformer
|
||||
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
|
||||
<resource>META-INF/cxf/bus-extensions.txt</resource>
|
||||
</transformer>
|
||||
</transformers>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:*</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/maven/**</exclude>
|
||||
<exclude>META-INF/*.SF</exclude>
|
||||
<exclude>META-INF/*.DSA</exclude>
|
||||
<exclude>META-INF/*.RSA</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
<relocations>
|
||||
<relocation>
|
||||
<pattern>com</pattern>
|
||||
<shadedPattern>repackaged.com.google.common</shadedPattern>
|
||||
<includes>
|
||||
<include>com.google.common.**</include>
|
||||
</includes>
|
||||
</relocation>
|
||||
</relocations>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
|
@ -103,6 +103,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -156,6 +157,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -95,6 +95,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -125,6 +125,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -95,6 +95,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -155,11 +156,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=2560
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
|
|
|
@ -95,6 +95,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -103,11 +103,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7000
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
|
@ -156,11 +157,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7000
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
|
|
|
@ -95,11 +95,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=10000
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -155,11 +156,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=2560
|
||||
--conf spark.sql.shuffle.partitions=4000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
|
|
|
@ -9,6 +9,7 @@ import java.util.List;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
|
||||
});
|
||||
}
|
||||
|
|
|
@ -10,6 +10,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
resultsRDD
|
||||
.union(projectsRDD)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -115,19 +115,7 @@ public class PrepareFOSSparkJob implements Serializable {
|
|||
.forEach(
|
||||
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
|
||||
r.setSubject(sbjs);
|
||||
r
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.PROVENANCE_ENRICH,
|
||||
null,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
null));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
|
|
@ -81,19 +81,7 @@ public class PrepareSDGSparkJob implements Serializable {
|
|||
s -> sbjs
|
||||
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
|
||||
r.setSubject(sbjs);
|
||||
r
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.PROVENANCE_ENRICH,
|
||||
null,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
null));
|
||||
|
||||
return r;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.personentity;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class CoAuthorshipIterator implements Iterator<Relation> {
|
||||
private int firstIndex;
|
||||
private int secondIndex;
|
||||
private boolean firstRelation;
|
||||
private List<String> authors;
|
||||
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______::";
|
||||
private static final String OPENAIRE_PREFIX = "openaire____";
|
||||
private static final String SEPARATOR = "::";
|
||||
private static final String ORCID_KEY = "10|" + OPENAIRE_PREFIX + SEPARATOR
|
||||
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
|
||||
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
|
||||
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return firstIndex < authors.size() - 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation next() {
|
||||
Relation rel = null;
|
||||
if (firstRelation) {
|
||||
rel = getRelation(authors.get(firstIndex), authors.get(secondIndex));
|
||||
firstRelation = Boolean.FALSE;
|
||||
} else {
|
||||
rel = getRelation(authors.get(secondIndex), authors.get(firstIndex));
|
||||
firstRelation = Boolean.TRUE;
|
||||
secondIndex += 1;
|
||||
if (secondIndex >= authors.size()) {
|
||||
firstIndex += 1;
|
||||
secondIndex = firstIndex + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return rel;
|
||||
}
|
||||
|
||||
public CoAuthorshipIterator(List<String> authors) {
|
||||
this.authors = authors;
|
||||
this.firstIndex = 0;
|
||||
this.secondIndex = 1;
|
||||
this.firstRelation = Boolean.TRUE;
|
||||
|
||||
}
|
||||
|
||||
private Relation getRelation(String orcid1, String orcid2) {
|
||||
String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
|
||||
String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
|
||||
return OafMapperUtils
|
||||
.getRelation(
|
||||
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
|
||||
ModelConstants.PERSON_PERSON_SUBRELTYPE,
|
||||
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
|
||||
Arrays.asList(OafMapperUtils.keyValue(ORCID_KEY, ModelConstants.ORCID_DS)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91"),
|
||||
null);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.personentity;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class Coauthors implements Serializable {
|
||||
private List<String> coauthors;
|
||||
|
||||
public List<String> getCoauthors() {
|
||||
return coauthors;
|
||||
}
|
||||
|
||||
public void setCoauthors(List<String> coauthors) {
|
||||
this.coauthors = coauthors;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.personentity;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class Couples implements Serializable {
|
||||
Person p;
|
||||
Relation r;
|
||||
|
||||
public Couples() {
|
||||
|
||||
}
|
||||
|
||||
public Person getP() {
|
||||
return p;
|
||||
}
|
||||
|
||||
public void setP(Person p) {
|
||||
this.p = p;
|
||||
}
|
||||
|
||||
public Relation getR() {
|
||||
return r;
|
||||
}
|
||||
|
||||
public void setR(Relation r) {
|
||||
this.r = r;
|
||||
}
|
||||
|
||||
public static <Tuples> Couples newInstance(Tuple2<Person, Relation> couple) {
|
||||
Couples c = new Couples();
|
||||
c.p = couple._1();
|
||||
c.r = couple._2();
|
||||
return c;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,431 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.personentity;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.*;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.spark_project.jetty.util.StringUtil;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.Constants;
|
||||
import eu.dnetlib.dhp.actionmanager.transformativeagreement.model.TransformativeAgreementModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.collection.orcid.model.Author;
|
||||
import eu.dnetlib.dhp.collection.orcid.model.Employment;
|
||||
import eu.dnetlib.dhp.collection.orcid.model.Work;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||
import eu.dnetlib.dhp.schema.oaf.Pid;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class ExtractPerson implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final String OPENAIRE_PREFIX = "openaire____";
|
||||
private static final String SEPARATOR = "::";
|
||||
private static final String orcidKey = "10|" + OPENAIRE_PREFIX + SEPARATOR
|
||||
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
|
||||
|
||||
private static final String DOI_PREFIX = "50|doi_________::";
|
||||
|
||||
private static final String PMID_PREFIX = "50|pmid________::";
|
||||
private static final String ARXIV_PREFIX = "50|arXiv_______::";
|
||||
|
||||
private static final String PMCID_PREFIX = "50|pmcid_______::";
|
||||
private static final String ROR_PREFIX = "20|ror_________::";
|
||||
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
|
||||
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
|
||||
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
|
||||
|
||||
public static void main(final String[] args) throws IOException, ParseException {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
ExtractPerson.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
||||
createActionSet(spark, inputPath, outputPath, workingDir);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
    private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {

        Dataset<Author> authors = spark
            .read()
            .parquet(inputPath + "Authors")
            .as(Encoders.bean(Author.class));

        Dataset<Work> works = spark
            .read()
            .parquet(inputPath + "Works")
            .as(Encoders.bean(Work.class))
            .filter(
                (FilterFunction<Work>) w -> Optional.ofNullable(w.getPids()).isPresent() &&
                    w
                        .getPids()
                        .stream()
                        .anyMatch(
                            p -> p.getSchema().equalsIgnoreCase("doi") ||
                                p.getSchema().equalsIgnoreCase("pmc") ||
                                p.getSchema().equalsIgnoreCase("pmid") ||
                                p.getSchema().equalsIgnoreCase("arxiv")));

        Dataset<Employment> employmentDataset = spark
            .read()
            .parquet(inputPath + "Employments")
            .as(Encoders.bean(Employment.class));

        Dataset<Author> peopleToMap = authors
            .joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
            .map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
            .groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));

        Dataset<Employment> employment = employmentDataset
            .joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
            .map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));

        Dataset<Person> people;
        peopleToMap.map((MapFunction<Author, Person>) op -> {
            Person person = new Person();
            person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
            person
                .setBiography(
                    Optional
                        .ofNullable(op.getBiography())
                        .orElse(""));
            KeyValue kv = OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS);
            kv.setDataInfo(null);
            person.setCollectedfrom(Arrays.asList(kv));
            person
                .setAlternativeNames(
                    Optional
                        .ofNullable(op.getOtherNames())
                        .orElse(new ArrayList<>()));
            person
                .setFamilyName(
                    Optional
                        .ofNullable(op.getFamilyName())
                        .orElse(""));
            person
                .setGivenName(
                    Optional
                        .ofNullable(op.getGivenName())
                        .orElse(""));
            person
                .setPid(
                    Optional
                        .ofNullable(op.getOtherPids())
                        .map(
                            v -> v
                                .stream()
                                .map(p -> Pid.newInstance(p.getSchema(), p.getValue()))
                                .collect(Collectors.toList()))
                        .orElse(new ArrayList<>()));
            person.getPid().add(Pid.newInstance(ModelConstants.ORCID, op.getOrcid()));
            person.setDateofcollection(op.getLastModifiedDate());
            person.setOriginalId(Arrays.asList(op.getOrcid()));
            return person;
        }, Encoders.bean(Person.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(workingDir + "/people");

        works
            .flatMap(
                (FlatMapFunction<Work, Relation>) ExtractPerson::getAuthorshipRelationIterator,
                Encoders.bean(Relation.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(workingDir + "/authorship");

        Dataset<Relation> coauthorship = works
            .flatMap((FlatMapFunction<Work, Tuple2<String, String>>) w -> {
                List<Tuple2<String, String>> lista = new ArrayList<>();
                w.getPids().stream().forEach(p -> {
                    if (p.getSchema().equalsIgnoreCase("doi") || p.getSchema().equalsIgnoreCase("pmc")
                        || p.getSchema().equalsIgnoreCase("pmid") || p.getSchema().equalsIgnoreCase("arxiv"))
                        lista.add(new Tuple2<>(p.getValue(), w.getOrcid()));
                });
                return lista.iterator();
            }, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
            .groupByKey((MapFunction<Tuple2<String, String>, String>) Tuple2::_1, Encoders.STRING())
            .mapGroups(
                (MapGroupsFunction<String, Tuple2<String, String>, Coauthors>) (k, it) -> extractCoAuthors(it),
                Encoders.bean(Coauthors.class))
            .flatMap(
                (FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
                Encoders.bean(Relation.class))
            .groupByKey((MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
            .mapGroups(
                (MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class));

        coauthorship
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(workingDir + "/coauthorship");

        employment
            .filter((FilterFunction<Employment>) e -> Optional.ofNullable(e.getAffiliationId()).isPresent())
            .filter((FilterFunction<Employment>) e -> e.getAffiliationId().getSchema().equalsIgnoreCase("ror"))
            .map(
                (MapFunction<Employment, Relation>) ExtractPerson::getAffiliationRelation,
                Encoders.bean(Relation.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(workingDir + "/affiliation");

        people = spark
            .read()
            .textFile(workingDir + "/people")
            .map(
                (MapFunction<String, Person>) value -> OBJECT_MAPPER
                    .readValue(value, Person.class),
                Encoders.bean(Person.class));

        people.show(false);
        people
            .toJavaRDD()
            .map(p -> new AtomicAction(p.getClass(), p))
            .union(
                getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
            .union(
                getRelations(spark, workingDir + "/coauthorship")
                    .toJavaRDD()
                    .map(r -> new AtomicAction(r.getClass(), r)))
            .union(
                getRelations(spark, workingDir + "/affiliation")
                    .toJavaRDD()
                    .map(r -> new AtomicAction(r.getClass(), r)))
            .mapToPair(
                aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
                    new Text(OBJECT_MAPPER.writeValueAsString(aa))))
            .saveAsHadoopFile(
                outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
    }

    private static Dataset<Relation> getRelations(SparkSession spark, String path) {
        return spark
            .read()
            .textFile(path)
            .map(
                (MapFunction<String, Relation>) value -> OBJECT_MAPPER
                    .readValue(value, Relation.class),
                Encoders.bean(Relation.class)); // spark.read().json(path).as(Encoders.bean(Relation.class));
    }

    private static Coauthors extractCoAuthors(Iterator<Tuple2<String, String>> it) {
        Coauthors coauth = new Coauthors();
        List<String> coauthors = new ArrayList<>();
        while (it.hasNext())
            coauthors.add(it.next()._2());
        coauth.setCoauthors(coauthors);

        return coauth;
    }

    private static Relation getAffiliationRelation(Employment row) {
        String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
        String target = ROR_PREFIX
            + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
        List<KeyValue> properties = new ArrayList<>();

        Relation relation = OafMapperUtils
            .getRelation(
                source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
                ModelConstants.ORG_PERSON_PARTICIPATES,
                Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
                OafMapperUtils
                    .dataInfo(
                        false, null, false, false,
                        OafMapperUtils
                            .qualifier(
                                ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
                                ModelConstants.DNET_PROVENANCE_ACTIONS),
                        "0.91"),
                null);

        if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
            KeyValue kv = new KeyValue();
            kv.setKey("startDate");
            kv.setValue(row.getStartDate());
            properties.add(kv);
        }
        if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
            KeyValue kv = new KeyValue();
            kv.setKey("endDate");
            kv.setValue(row.getEndDate());
            properties.add(kv);
        }

        if (properties.size() > 0)
            relation.setProperties(properties);
        return relation;

    }

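    // Illustrative sketch, not part of this changeset: how the relation endpoints used in this class
    // are composed. IdentifierFactory.md5 is assumed to yield a hex-encoded MD5 digest, and the
    // example values below are made up.
    //
    //     String orcid = "0000-0002-1825-0097";                                        // example ORCID iD
    //     String personId = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);       // as in getRelation(...)
    //     String resultId = DOI_PREFIX
    //         + IdentifierFactory.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), "10.1000/xyz123"));
    //     // getRelation(...) further below links personId (source) to resultId (target)
    //     // using ModelConstants.RESULT_PERSON_HASAUTHORED.
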
    private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
        String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
        String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);

        return Arrays
            .asList(
                OafMapperUtils
                    .getRelation(
                        source, target, ModelConstants.PERSON_PERSON_RELTYPE,
                        ModelConstants.PERSON_PERSON_SUBRELTYPE,
                        ModelConstants.PERSON_PERSON_HASCOAUTHORED,
                        Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
                        OafMapperUtils
                            .dataInfo(
                                false, null, false, false,
                                OafMapperUtils
                                    .qualifier(
                                        ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
                                        ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
                                "0.91"),
                        null),
                OafMapperUtils
                    .getRelation(
                        target, source, ModelConstants.PERSON_PERSON_RELTYPE,
                        ModelConstants.PERSON_PERSON_SUBRELTYPE,
                        ModelConstants.PERSON_PERSON_HASCOAUTHORED,
                        Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
                        OafMapperUtils
                            .dataInfo(
                                false, null, false, false,
                                OafMapperUtils
                                    .qualifier(
                                        ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
                                        ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
                                "0.91"),
                        null));

    }

    private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {

        if (Optional.ofNullable(w.getPids()).isPresent())
            return w
                .getPids()
                .stream()
                .map(pid -> getRelation(w.getOrcid(), pid))
                .filter(Objects::nonNull)
                .collect(Collectors.toList())
                .iterator();
        List<Relation> ret = new ArrayList<>();
        return ret.iterator();
    }

    private static Relation getRelation(String orcid, eu.dnetlib.dhp.collection.orcid.model.Pid pid) {
        String target;
        String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
        switch (pid.getSchema()) {
            case "doi":
                target = DOI_PREFIX
                    + IdentifierFactory
                        .md5(PidCleaner.normalizePidValue(PidType.doi.toString(), pid.getValue()));
                break;
            case "pmid":
                target = PMID_PREFIX
                    + IdentifierFactory
                        .md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), pid.getValue()));
                break;
            case "arxiv":
                target = ARXIV_PREFIX
                    + IdentifierFactory
                        .md5(PidCleaner.normalizePidValue(PidType.arXiv.toString(), pid.getValue()));
                break;
            case "pmcid":
                target = PMCID_PREFIX
                    + IdentifierFactory
                        .md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), pid.getValue()));
                break;

            default:
                return null;
        }

        return OafMapperUtils
            .getRelation(
                source, target, ModelConstants.RESULT_PERSON_RELTYPE,
                ModelConstants.RESULT_PERSON_SUBRELTYPE,
                ModelConstants.RESULT_PERSON_HASAUTHORED,
                Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
                OafMapperUtils
                    .dataInfo(
                        false, null, false, false,
                        OafMapperUtils
                            .qualifier(
                                ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
                                ModelConstants.DNET_PROVENANCE_ACTIONS),
                        "0.91"),
                null);
    }
}
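A minimal invocation sketch for the class above, assuming it is submitted through spark-submit (the production run goes through the Oozie action further below). The paths are placeholders except for the input path, which reuses the value from the job properties shown later in this changeset; the double-dash argument style mirrors the workflow's <arg> entries.

// Hypothetical launcher, only to illustrate the expected arguments of ExtractPerson.main.
public class ExtractPersonArgsSketch {
    public static void main(String[] args) throws Exception {
        eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson
            .main(
                new String[] {
                    "--isSparkSessionManaged", "true",            // defaults to true when omitted
                    "--inputPath", "/data/orcid_2023/tables/",    // expects Authors/Works/Employments parquet
                    "--outputPath", "/user/example/peopleAS",     // placeholder output path for the action set
                    "--workingDir", "/tmp/person_working"         // placeholder scratch dir for intermediate json
                });
    }
}
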

@@ -0,0 +1,25 @@

package eu.dnetlib.dhp.actionmanager.personentity;

import java.io.Serializable;
import java.util.ArrayList;

import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;

import eu.dnetlib.dhp.collection.orcid.model.Work;

public class WorkList implements Serializable {
    private ArrayList<Work> workArrayList;

    public ArrayList<Work> getWorkArrayList() {
        return workArrayList;
    }

    public void setWorkArrayList(ArrayList<Work> workArrayList) {
        this.workArrayList = workArrayList;
    }

    public WorkList() {
        workArrayList = new ArrayList<>();
    }
}
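A small usage sketch for the container above (the assumption is that it exists to give Jackson/Spark a concrete, serializable holder for a group of ORCID works; the ORCID iD is an example value and the class is assumed to sit in the same package as WorkList and Work).

// Hypothetical snippet, only to show the bean in use.
public class WorkListSketch {
    public static void main(String[] args) throws Exception {
        WorkList wl = new WorkList();
        Work w = new Work();
        w.setOrcid("0000-0002-1825-0097"); // example ORCID iD
        wl.getWorkArrayList().add(w);
        System.out.println(new com.fasterxml.jackson.databind.ObjectMapper().writeValueAsString(wl));
    }
}
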

@@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.StructType;
@@ -70,6 +71,9 @@ public class CreateActionSetFromWebEntries implements Serializable {
        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String blackListInputPath = parser.get("blackListPath");
        log.info("blackListInputPath: {}", blackListInputPath);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
@@ -77,25 +81,31 @@ public class CreateActionSetFromWebEntries implements Serializable {
            isSparkSessionManaged,
            spark -> {

                createActionSet(spark, inputPath, outputPath);
                createActionSet(spark, inputPath, outputPath, blackListInputPath);

            });
    }

    public static void createActionSet(SparkSession spark, String inputPath,
        String outputPath) {
        String outputPath, String blackListInputPath) {

        final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
            .filter("publication_year <= 2020 or country_code=='IE'")
            .filter("country_code=='IE'")
            .drop("publication_year");

        dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
        final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);

        dataset
            .join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
            .filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
            .drop("OpenAlexId")
            .flatMap((FlatMapFunction<Row, Relation>) row -> {
                List<Relation> ret = new ArrayList<>();
                final String ror = ROR_PREFIX
                    + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
                ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
                ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
                ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
                // ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
                // ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));

                return ret
                    .iterator();
@@ -129,11 +139,26 @@ public class CreateActionSetFromWebEntries implements Serializable {
                "institution", functions
                    .explode(
                        functions.col("institutions")))

            .selectExpr(
                "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
                "id", "doi", "institution.ror as ror",
                "institution.country_code as country_code", "publication_year")
            .distinct();

        // .selectExpr(
        // "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
        // "institution.country_code as country_code", "publication_year")
        // .distinct();

    }

    private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {

        return spark
            .read()
            .option("header", true)
            .csv(inputPath)
            .select("OpenAlexId");
    }

    private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {

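The blacklist handling introduced above (a left join on OpenAlexId followed by a null filter and a drop) can equivalently be sketched with Spark's built-in anti join; dataset and blackList refer to the variables in the snippet above.

// Equivalent exclusion sketch: keep only rows whose id has no match in the blacklist.
Dataset<Row> whitelisted = dataset
    .join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left_anti");
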

@@ -20,6 +20,9 @@ public class Author extends ORCIDItem {

    private String lastModifiedDate;

    public Author() {
    }

    public String getBiography() {
        return biography;
    }

@@ -11,4 +11,7 @@ public class ORCIDItem {
    public void setOrcid(String orcid) {
        this.orcid = orcid;
    }

    public ORCIDItem() {
    }
}

@@ -32,4 +32,6 @@ public class Work extends ORCIDItem {
        pids.add(pid);
    }

    public Work() {
    }
}


@@ -1,6 +1,7 @@

package eu.dnetlib.dhp.collection.plugin.rest;

import java.util.Map;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
@@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;

import org.apache.commons.lang3.StringUtils;

import com.google.gson.Gson;

import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
        final String entityXpath = api.getParams().get("entityXpath");
        final String authMethod = api.getParams().get("authMethod");
        final String authToken = api.getParams().get("authToken");
        final String requestHeaderMap = api.getParams().get("requestHeaderMap");
        Gson gson = new Gson();
        Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
        final String resultSizeValue = Optional
            .ofNullable(api.getParams().get("resultSizeValue"))
            .filter(StringUtils::isNotBlank)
@@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
        if (StringUtils.isBlank(resultFormatValue)) {
            throw new CollectorException("Param 'resultFormatValue' is null or empty");
        }
        if (StringUtils.isBlank(queryParams)) {
            throw new CollectorException("Param 'queryParams' is null or empty");
        }
        if (StringUtils.isBlank(entityXpath)) {
            throw new CollectorException("Param 'entityXpath' is null or empty");
        }
@@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
            entityXpath,
            authMethod,
            authToken,
            resultOutputFormat);
            resultOutputFormat,
            requestHeaders);

        return StreamSupport
            .stream(
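The requestHeaderMap parameter introduced above is a JSON object carried in the API parameters and turned into a plain Map by Gson before it is handed to the iterator. A minimal sketch of that parsing step; the header names and values are made-up examples, not values prescribed by the plugin.

import java.util.Map;

import com.google.gson.Gson;

// Hypothetical snippet showing how a requestHeaderMap value is deserialized.
public class RequestHeaderMapSketch {
    public static void main(String[] args) {
        String requestHeaderMap = "{\"Accept\": \"application/xml\", \"User-Agent\": \"dnet-collector\"}";
        Map<?, ?> requestHeaders = new Gson().fromJson(requestHeaderMap, Map.class);
        System.out.println(requestHeaders); // each entry becomes one HTTP request header
    }
}
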
@ -9,8 +9,11 @@ import java.net.URL;
|
|||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
|
@ -22,14 +25,14 @@ import javax.xml.xpath.*;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.apache.http.entity.ContentType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
@ -44,23 +47,28 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|||
*
|
||||
*/
|
||||
public class RestIterator implements Iterator<String> {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
|
||||
public static final String UTF_8 = "UTF-8";
|
||||
private static final int MAX_ATTEMPTS = 5;
|
||||
|
||||
private final HttpClientParams clientParams;
|
||||
|
||||
private final String BASIC = "basic";
|
||||
private final String AUTHBASIC = "basic";
|
||||
|
||||
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
|
||||
+ ">";
|
||||
|
||||
private final String baseUrl;
|
||||
private final String resumptionType;
|
||||
private final String resumptionParam;
|
||||
private final String resultFormatValue;
|
||||
private String queryParams;
|
||||
private String queryParams = "";
|
||||
private final int resultSizeValue;
|
||||
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
||||
private int resultTotal = -1;
|
||||
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
|
||||
private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
|
||||
// harvest
|
||||
// or token scanned from results)
|
||||
private InputStream resultStream;
|
||||
private Transformer transformer;
|
||||
|
@ -73,7 +81,7 @@ public class RestIterator implements Iterator<String> {
|
|||
private final String querySize;
|
||||
private final String authMethod;
|
||||
private final String authToken;
|
||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
|
||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
|
||||
private int discoverResultSize = 0;
|
||||
private int pagination = 1;
|
||||
/*
|
||||
|
@ -83,8 +91,13 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
private final String resultOutputFormat;
|
||||
|
||||
/** RestIterator class
|
||||
* compatible to version 1.3.33
|
||||
/*
|
||||
* Can be used to set additional request headers, like for content negotiation
|
||||
*/
|
||||
private Map<String, String> requestHeaders;
|
||||
|
||||
/**
|
||||
* RestIterator class compatible to version 1.3.33
|
||||
*/
|
||||
public RestIterator(
|
||||
final HttpClientParams clientParams,
|
||||
|
@ -101,47 +114,56 @@ public class RestIterator implements Iterator<String> {
|
|||
final String entityXpath,
|
||||
final String authMethod,
|
||||
final String authToken,
|
||||
final String resultOutputFormat) {
|
||||
final String resultOutputFormat,
|
||||
final Map<String, String> requestHeaders) {
|
||||
|
||||
this.clientParams = clientParams;
|
||||
this.baseUrl = baseUrl;
|
||||
this.resumptionType = resumptionType;
|
||||
this.resumptionParam = resumptionParam;
|
||||
this.resultFormatValue = resultFormatValue;
|
||||
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
|
||||
this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
|
||||
this.queryParams = queryParams;
|
||||
this.authMethod = authMethod;
|
||||
this.authToken = authToken;
|
||||
this.resultOutputFormat = resultOutputFormat;
|
||||
this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
|
||||
|
||||
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||
: "";
|
||||
this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
|
||||
: "";
|
||||
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
|
||||
|
||||
try {
|
||||
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
||||
} catch (Exception e) {
|
||||
} catch (final Exception e) {
|
||||
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
initQueue();
|
||||
}
|
||||
|
||||
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
|
||||
private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
|
||||
final String entityXpath)
|
||||
throws TransformerConfigurationException, XPathExpressionException {
|
||||
final TransformerFactory factory = TransformerFactory.newInstance();
|
||||
transformer = factory.newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||
xpath = XPathFactory.newInstance().newXPath();
|
||||
xprResultTotalPath = xpath.compile(resultTotalXpath);
|
||||
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||
xprEntity = xpath.compile(entityXpath);
|
||||
this.transformer = factory.newTransformer();
|
||||
this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||
this.xpath = XPathFactory.newInstance().newXPath();
|
||||
this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
|
||||
this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||
this.xprEntity = this.xpath.compile(entityXpath);
|
||||
}
|
||||
|
||||
private void initQueue() {
|
||||
if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
|
||||
query = baseUrl;
|
||||
} else {
|
||||
query = baseUrl + "?" + queryParams + querySize + queryFormat;
|
||||
log.info("REST calls starting with {}", query);
|
||||
}
|
||||
|
||||
log.info("REST calls starting with {}", this.query);
|
||||
}
|
||||
|
||||
private void disconnect() {
|
||||
|
@ -154,11 +176,22 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (recordQueue.isEmpty() && query.isEmpty()) {
|
||||
synchronized (this.recordQueue) {
|
||||
while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
|
||||
try {
|
||||
this.query = downloadPage(this.query, 0);
|
||||
} catch (final CollectorException e) {
|
||||
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
if (!this.recordQueue.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
disconnect();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -168,27 +201,34 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (recordQueue) {
|
||||
while (recordQueue.isEmpty() && !query.isEmpty()) {
|
||||
try {
|
||||
query = downloadPage(query);
|
||||
} catch (CollectorException e) {
|
||||
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return recordQueue.poll();
|
||||
synchronized (this.recordQueue) {
|
||||
return this.recordQueue.poll();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* download page and return nextQuery
|
||||
* download page and return nextQuery (with number of attempt)
|
||||
*/
|
||||
private String downloadPage(String query) throws CollectorException {
|
||||
private String downloadPage(String query, final int attempt) throws CollectorException {
|
||||
|
||||
if (attempt > MAX_ATTEMPTS) {
|
||||
throw new CollectorException("Max Number of attempts reached, query:" + query);
|
||||
}
|
||||
|
||||
if (attempt > 0) {
|
||||
final int delay = (attempt * 5000);
|
||||
log.debug("Attempt {} with delay {}", attempt, delay);
|
||||
try {
|
||||
Thread.sleep(delay);
|
||||
} catch (final InterruptedException e) {
|
||||
new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
String resultJson;
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
String resultXml = XML_HEADER;
|
||||
String nextQuery = "";
|
||||
String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
||||
Node resultNode = null;
|
||||
NodeList nodeList = null;
|
||||
String qUrlArgument = "";
|
||||
|
@ -196,81 +236,96 @@ public class RestIterator implements Iterator<String> {
|
|||
InputStream theHttpInputStream;
|
||||
|
||||
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
|
||||
if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
|
||||
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
|
||||
if (!query.contains("&cursor=")) {
|
||||
query += "&cursor=*";
|
||||
}
|
||||
}
|
||||
|
||||
// find pagination page start number in queryParam and remove before start the first query
|
||||
if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
|
||||
&& (query.contains("paginationStart="))) {
|
||||
|
||||
final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
|
||||
m.find(); // guaranteed to be true for this regex
|
||||
|
||||
String[] pageVal = m.group(0).split("=");
|
||||
pagination = Integer.parseInt(pageVal[1]);
|
||||
|
||||
// remove page start number from query and queryParams
|
||||
queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", "");
|
||||
query = query.replaceFirst("&?paginationStart=[0-9]+", "");
|
||||
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("requestig URL [{}]", query);
|
||||
log.info("requesting URL [{}]", query);
|
||||
|
||||
URL qUrl = new URL(query);
|
||||
log.debug("authMethod: {}", authMethod);
|
||||
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else {
|
||||
theHttpInputStream = qUrl.openStream();
|
||||
final URL qUrl = new URL(query);
|
||||
log.debug("authMethod: {}", this.authMethod);
|
||||
if (this.authMethod == "bearer") {
|
||||
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
|
||||
requestHeaders.put("Authorization", "Bearer " + authToken);
|
||||
// requestHeaders.put("Content-Type", "application/json");
|
||||
} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
|
||||
requestHeaders.put("Authorization", "Basic " + authToken);
|
||||
// requestHeaders.put("accept", "application/xml");
|
||||
}
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestMethod("GET");
|
||||
this.setRequestHeader(conn);
|
||||
resultStream = conn.getInputStream();
|
||||
|
||||
resultStream = theHttpInputStream;
|
||||
if ("json".equals(resultOutputFormat)) {
|
||||
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
|
||||
if ("json".equals(this.resultOutputFormat)) {
|
||||
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
|
||||
resultXml = JsonUtils.convertToXML(resultJson);
|
||||
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||
}
|
||||
|
||||
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
if (!isEmptyXml(resultXml)) {
|
||||
resultNode = (Node) this.xpath
|
||||
.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
log.debug("nodeList.length: {}", nodeList.getLength());
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
StringWriter sw = new StringWriter();
|
||||
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
String toEnqueue = sw.toString();
|
||||
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||
log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
|
||||
final StringWriter sw = new StringWriter();
|
||||
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
final String toEnqueue = sw.toString();
|
||||
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
|
||||
log
|
||||
.warn(
|
||||
"The following record resulted in empty item for the feeding queue: {}", resultXml);
|
||||
} else {
|
||||
recordQueue.add(sw.toString());
|
||||
this.recordQueue.add(sw.toString());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log.warn("resultXml is equal with emptyXml");
|
||||
}
|
||||
|
||||
resumptionInt += resultSizeValue;
|
||||
this.resumptionInt += this.resultSizeValue;
|
||||
|
||||
switch (resumptionType.toLowerCase()) {
|
||||
switch (this.resumptionType.toLowerCase()) {
|
||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||
resumptionStr = xprResumptionPath.evaluate(resultNode);
|
||||
this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
|
||||
break;
|
||||
|
||||
case "count": // begin at one step for all records, iterate over items
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
break;
|
||||
|
||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||
if (resultSizeValue < 2) {
|
||||
if (this.resultSizeValue < 2) {
|
||||
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||
}
|
||||
qUrlArgument = qUrl.getQuery();
|
||||
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(resumptionParam)) {
|
||||
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
|
||||
final String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (final String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
|
||||
final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
if (isInteger(resumptionKeyValue[1])) {
|
||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
|
||||
|
@ -280,61 +335,63 @@ public class RestIterator implements Iterator<String> {
|
|||
}
|
||||
}
|
||||
|
||||
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
|
||||
if (isEmptyXml(resultXml)
|
||||
|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
|
||||
// resumptionStr = "";
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
resultTotal = discoverResultSize;
|
||||
this.resultTotal = this.discoverResultSize;
|
||||
} else {
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
resultTotal = resumptionInt + 1;
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
this.resultTotal = this.resumptionInt + 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
}
|
||||
log.info("discoverResultSize: {}", discoverResultSize);
|
||||
log.info("discoverResultSize: {}", this.discoverResultSize);
|
||||
break;
|
||||
|
||||
case "pagination":
|
||||
case "page": // pagination, iterate over page numbers
|
||||
pagination += 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
if (nodeList != null && nodeList.getLength() > 0) {
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
} else {
|
||||
resultTotal = discoverResultSize;
|
||||
pagination = discoverResultSize;
|
||||
this.resultTotal = this.discoverResultSize;
|
||||
this.pagination = this.discoverResultSize;
|
||||
}
|
||||
resumptionInt = pagination;
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
this.pagination += 1;
|
||||
this.resumptionInt = this.pagination;
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
break;
|
||||
|
||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
|
||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
|
||||
// in
|
||||
// solr)
|
||||
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||
|
||||
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
|
||||
queryParams = queryParams.replace("&cursor=*", "");
|
||||
this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
|
||||
this.queryParams = this.queryParams.replace("&cursor=*", "");
|
||||
|
||||
// terminating if length of nodeList is 0
|
||||
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
|
||||
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
|
||||
if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
|
||||
this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
|
||||
} else {
|
||||
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
|
||||
this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
|
||||
// resultSizeValue
|
||||
// because the iteration is over
|
||||
// real length and the
|
||||
// resultSizeValue is added before
|
||||
// the switch()
|
||||
}
|
||||
|
||||
discoverResultSize = nodeList.getLength();
|
||||
this.discoverResultSize = nodeList.getLength();
|
||||
|
||||
log
|
||||
.debug(
|
||||
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
|
||||
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
|
||||
"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
|
||||
+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
|
||||
|
||||
break;
|
||||
|
||||
|
@ -343,28 +400,30 @@ public class RestIterator implements Iterator<String> {
|
|||
break;
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
try {
|
||||
if (resultTotal == -1) {
|
||||
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
|
||||
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
|
||||
resultTotal += 1;
|
||||
if (this.resultTotal == -1) {
|
||||
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
|
||||
if ("page".equalsIgnoreCase(this.resumptionType)
|
||||
&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
this.resultTotal += 1;
|
||||
} // to correct the upper bound
|
||||
log.info("resultTotal was -1 is now: " + resultTotal);
|
||||
log.info("resultTotal was -1 is now: " + this.resultTotal);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||
}
|
||||
log.debug("resultTotal: " + resultTotal);
|
||||
log.debug("resInt: " + resumptionInt);
|
||||
if (resumptionInt <= resultTotal) {
|
||||
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
|
||||
+ queryFormat;
|
||||
log.debug("resultTotal: " + this.resultTotal);
|
||||
log.debug("resInt: " + this.resumptionInt);
|
||||
if (this.resumptionInt <= this.resultTotal) {
|
||||
nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
|
||||
+ this.resumptionStr
|
||||
+ this.queryFormat;
|
||||
} else {
|
||||
nextQuery = "";
|
||||
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||
|
@ -372,10 +431,18 @@ public class RestIterator implements Iterator<String> {
|
|||
}
|
||||
log.debug("nextQueryUrl: " + nextQuery);
|
||||
return nextQuery;
|
||||
} catch (final Throwable e) {
|
||||
log.warn(e.getMessage(), e);
|
||||
return downloadPage(query, attempt + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private boolean isInteger(String s) {
|
||||
private boolean isEmptyXml(String s) {
|
||||
return EMPTY_XML.equalsIgnoreCase(s);
|
||||
}
|
||||
|
||||
private boolean isInteger(final String s) {
|
||||
boolean isValidInteger = false;
|
||||
try {
|
||||
Integer.parseInt(s);
|
||||
|
@ -383,7 +450,7 @@ public class RestIterator implements Iterator<String> {
|
|||
// s is a valid integer
|
||||
|
||||
isValidInteger = true;
|
||||
} catch (NumberFormatException ex) {
|
||||
} catch (final NumberFormatException ex) {
|
||||
// s is not an integer
|
||||
}
|
||||
|
||||
|
@ -391,20 +458,36 @@ public class RestIterator implements Iterator<String> {
|
|||
}
|
||||
|
||||
// Method to encode a string value using `UTF-8` encoding scheme
|
||||
private String encodeValue(String value) {
|
||||
private String encodeValue(final String value) {
|
||||
try {
|
||||
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
} catch (final UnsupportedEncodingException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* setRequestHeader
|
||||
*
|
||||
* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
|
||||
* @param conn
|
||||
*/
|
||||
private void setRequestHeader(HttpURLConnection conn) {
|
||||
if (requestHeaders != null) {
|
||||
for (String key : requestHeaders.keySet()) {
|
||||
conn.setRequestProperty(key, requestHeaders.get(key));
|
||||
}
|
||||
log.debug("Set Request Header with: " + requestHeaders);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public String getResultFormatValue() {
|
||||
return resultFormatValue;
|
||||
return this.resultFormatValue;
|
||||
}
|
||||
|
||||
public String getResultOutputFormat() {
|
||||
return resultOutputFormat;
|
||||
return this.resultOutputFormat;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -8,7 +8,10 @@ import java.io.StringWriter;
|
|||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.xml.stream.XMLEventFactory;
|
||||
import javax.xml.stream.XMLEventReader;
|
||||
|
@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
|
|||
import javax.xml.stream.events.StartElement;
|
||||
import javax.xml.stream.events.XMLEvent;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
|
@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {
|
|||
|
||||
private String element;
|
||||
|
||||
private List<String> elements;
|
||||
|
||||
private InputStream inputStream;
|
||||
|
||||
public XMLIterator(final String element, final InputStream inputStream) {
|
||||
super();
|
||||
this.element = element;
|
||||
if (element.contains(",")) {
|
||||
elements = Arrays
|
||||
.stream(element.split(","))
|
||||
.filter(StringUtils::isNoneBlank)
|
||||
.map(String::toLowerCase)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
this.inputStream = inputStream;
|
||||
this.parser = getParser();
|
||||
|
||||
try {
|
||||
this.current = findElement(parser);
|
||||
} catch (XMLStreamException e) {
|
||||
|
@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
|
|||
final XMLEvent event = parser.nextEvent();
|
||||
|
||||
// TODO: replace with depth tracking instead of close tag tracking.
|
||||
if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
|
||||
if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
|
||||
writer.add(event);
|
||||
break;
|
||||
}
|
||||
|
@ -142,31 +156,48 @@ public class XMLIterator implements Iterator<String> {
|
|||
XMLEvent peek = parser.peek();
|
||||
if (peek != null && peek.isStartElement()) {
|
||||
String name = peek.asStartElement().getName().getLocalPart();
|
||||
if (element.equals(name)) {
|
||||
if (isCheckTag(name))
|
||||
return peek;
|
||||
}
|
||||
}
|
||||
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent event = parser.nextEvent();
|
||||
XMLEvent event = parser.nextEvent();
|
||||
if (event != null && event.isStartElement()) {
|
||||
String name = event.asStartElement().getName().getLocalPart();
|
||||
if (element.equals(name)) {
|
||||
if (isCheckTag(name))
|
||||
return event;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private XMLEventReader getParser() {
|
||||
try {
|
||||
return inputFactory.get().createXMLEventReader(sanitize(inputStream));
|
||||
XMLInputFactory xif = inputFactory.get();
|
||||
xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
|
||||
return xif.createXMLEventReader(sanitize(inputStream));
|
||||
} catch (XMLStreamException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isCheckTag(final String tagName) {
|
||||
if (elements != null) {
|
||||
final String found = elements
|
||||
.stream()
|
||||
.filter(e -> e.equalsIgnoreCase(tagName))
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
if (found != null)
|
||||
return true;
|
||||
} else {
|
||||
if (element.equalsIgnoreCase(tagName)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private Reader sanitize(final InputStream in) {
|
||||
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
|
||||
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
|
||||
|
|
|
@@ -0,0 +1,25 @@
[
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the zipped opencitations file",
    "paramRequired": true
  },
  {
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the working path",
    "paramRequired": true
  },
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "the hdfs name node",
    "paramRequired": false
  }, {
    "paramName": "wd",
    "paramLongName": "workingDir",
    "paramDescription": "the hdfs name node",
    "paramRequired": false
  }
]
@@ -0,0 +1,2 @@
inputPath=/data/orcid_2023/tables/
outputPath=/user/miriam.baglioni/peopleAS
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,111 @@
|
|||
<workflow-app name="PersonEntity" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
|
||||
<property>
|
||||
<name>inputPath</name>
|
||||
<description>inputPath</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
<start to="deleteoutputpath"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<action name="deleteoutputpath">
|
||||
<fs>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="atomicactions"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="atomicactions">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the ActionSet for Person entity and relevant relations</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=4
|
||||
--executor-memory=4G
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=5G
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@@ -16,5 +16,10 @@
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "the hdfs name node",
    "paramRequired": false
  }
  },{
    "paramName": "bl",
    "paramLongName": "blackListPath",
    "paramDescription": "the working path",
    "paramRequired": true
  }
]

|
@@ -1,2 +1,3 @@
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
outputPath=/tmp/miriam/webcrawlComplete/
blackListPath=/user/miriam.baglioni/openalex-blackList

|
@@ -45,6 +45,7 @@
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--blackListPath</arg><arg>${blackListPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>

|
@ -58,7 +58,7 @@
|
|||
"uri": "http://dx.doi.org/10.13039/100010414",
|
||||
"name": "Health Research Board",
|
||||
"synonym": [
|
||||
"501100001590"
|
||||
"501100001590", "501100023273"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -85,24 +85,6 @@
|
|||
"name": "Irish College of General Practitioners",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012734",
|
||||
"uri": "http://dx.doi.org/10.13039/100012734",
|
||||
"name": "Department for Culture, Heritage and the Gaeltacht, Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012754",
|
||||
"uri": "http://dx.doi.org/10.13039/100012754",
|
||||
"name": "Horizon Pharma",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012891",
|
||||
"uri": "http://dx.doi.org/10.13039/100012891",
|
||||
"name": "Medical Research Charities Group",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012919",
|
||||
"uri": "http://dx.doi.org/10.13039/100012919",
|
||||
|
@ -233,7 +215,7 @@
|
|||
"id": "100018064",
|
||||
"uri": "http://dx.doi.org/10.13039/100018064",
|
||||
"name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media",
|
||||
"synonym": []
|
||||
"synonym": ["100012734"]
|
||||
},
|
||||
{
|
||||
"id": "100018172",
|
||||
|
@ -281,13 +263,13 @@
|
|||
"id": "100019637",
|
||||
"uri": "http://dx.doi.org/10.13039/100019637",
|
||||
"name": "Horizon Therapeutics",
|
||||
"synonym": []
|
||||
"synonym": ["100012754"]
|
||||
},
|
||||
{
|
||||
"id": "100020174",
|
||||
"uri": "http://dx.doi.org/10.13039/100020174",
|
||||
"name": "Health Research Charities Ireland",
|
||||
"synonym": []
|
||||
"synonym": ["100012891"]
|
||||
},
|
||||
{
|
||||
"id": "100020202",
|
||||
|
@ -319,12 +301,7 @@
|
|||
"name": "Centre for Ageing Research and Development in Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001583",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001583",
|
||||
"name": "Cystinosis Foundation Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
|
||||
{
|
||||
"id": "501100001584",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001584",
|
||||
|
@ -521,7 +498,7 @@
|
|||
"id": "501100003037",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003037",
|
||||
"name": "Elan",
|
||||
"synonym": []
|
||||
"synonym": ["501100021694"]
|
||||
},
|
||||
{
|
||||
"id": "501100003496",
|
||||
|
@ -595,17 +572,11 @@
|
|||
"name": "Technological University Dublin",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100009269",
|
||||
"uri": "http://dx.doi.org/10.13039/501100009269",
|
||||
"name": "Programme of Competitive Forestry Research for Development",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100009315",
|
||||
"uri": "http://dx.doi.org/10.13039/501100009315",
|
||||
"name": "Cystinosis Ireland",
|
||||
"synonym": []
|
||||
"synonym": ["501100001583"]
|
||||
},
|
||||
{
|
||||
"id": "501100010808",
|
||||
|
@ -625,12 +596,6 @@
|
|||
"name": "Alimentary Health",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100011103",
|
||||
"uri": "http://dx.doi.org/10.13039/501100011103",
|
||||
"name": "Rann\u00eds",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100012354",
|
||||
"uri": "http://dx.doi.org/10.13039/501100012354",
|
||||
|
@ -733,12 +698,6 @@
|
|||
"name": "Insight SFI Research Centre for Data Analytics",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100021694",
|
||||
"uri": "http://dx.doi.org/10.13039/501100021694",
|
||||
"name": "Elan Pharma International",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100021838",
|
||||
"uri": "http://dx.doi.org/10.13039/501100021838",
|
||||
|
@ -769,12 +728,6 @@
|
|||
"name": "Institute of Technology, Tralee",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100023273",
|
||||
"uri": "http://dx.doi.org/10.13039/501100023273",
|
||||
"name": "HRB Clinical Research Facility Galway",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100023378",
|
||||
"uri": "http://dx.doi.org/10.13039/501100023378",
|
||||
|
|
|
@@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
        tp._1 match {
          case "electronic" => journal.setIssnOnline(tp._2)
          case "print"      => journal.setIssnPrinted(tp._2)
          case _            =>
        }
      })
    }
|
|
@ -79,23 +79,6 @@ object MagUtility extends Serializable {
  private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)

  private val MAGDataInfo: DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    di.setInvisible(false)
    di.setTrust("0.9")
    di.setProvenanceaction(
      OafMapperUtils.qualifier(
        ModelConstants.SYSIMPORT_ACTIONSET,
        ModelConstants.SYSIMPORT_ACTIONSET,
        ModelConstants.DNET_PROVENANCE_ACTIONS,
        ModelConstants.DNET_PROVENANCE_ACTIONS
      )
    )
    di
  }

  private val MAGDataInfoInvisible: DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)

@ -453,7 +436,6 @@ object MagUtility extends Serializable {

      case "repository" =>
        result = new Publication()
        result.setDataInfo(MAGDataInfoInvisible)
        qualifier(
          "0038",
          "Other literature type",

@ -488,7 +470,6 @@ object MagUtility extends Serializable {
    }

    if (result != null) {
      if (result.getDataInfo == null)
        result.setDataInfo(MAGDataInfo)
      val i = new Instance
      i.setInstancetype(tp)

@ -512,7 +493,7 @@ object MagUtility extends Serializable {
      return null

    result.setCollectedfrom(List(MAGCollectedFrom).asJava)
    val pidList = List(
    var pidList = List(
      structuredProperty(
        paper.paperId.get.toString,
        qualifier(

@ -525,8 +506,6 @@ object MagUtility extends Serializable {
        )
      )

    result.setPid(pidList.asJava)

    result.setOriginalId(pidList.map(s => s.getValue).asJava)

    result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")

@ -618,10 +597,9 @@ object MagUtility extends Serializable {
    }

    val instance = result.getInstance().get(0)
    instance.setPid(pidList.asJava)
    if (paper.doi.orNull != null)
      instance.setAlternateIdentifier(
        List(

    if (paper.doi.orNull != null) {
      pidList = pidList ::: List(
        structuredProperty(
          paper.doi.get,
          qualifier(

@ -632,8 +610,10 @@ object MagUtility extends Serializable {
        ),
        null
      )
      ).asJava
    }
    instance.setPid(pidList.asJava)
    result.setPid(pidList.asJava)
    instance.setUrl(paper.urls.get.asJava)
    instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
    instance.setCollectedfrom(MAGCollectedFrom)
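The hunks above turn pidList from a val into a var so that, when a DOI is present, it is appended to the same list and set on both the instance and the result (previously the DOI went into alternateIdentifier). A minimal sketch of that pattern, with a hypothetical Pid placeholder standing in for the StructuredProperty/qualifier helpers used in MagUtility:

import scala.collection.JavaConverters._

// Hypothetical placeholder for the real StructuredProperty/qualifier helpers.
case class Pid(value: String, scheme: String)

def collectPids(paperId: String, doi: Option[String]): java.util.List[Pid] = {
  var pidList = List(Pid(paperId, "mag_id"))        // the MAG paper id is always present
  if (doi.orNull != null)                           // the DOI is optional in MAG
    pidList = pidList ::: List(Pid(doi.get, "doi")) // appended as a pid instead of an alternate identifier
  pidList.asJava                                    // the same list is then set on instance and result
}
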
@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
    spark.read
      .load(s"$magBasePath/mag_denormalized")
      .as[MAGPaper]
      .filter(col("doi").isNotNull)
      .map(s => MagUtility.convertMAGtoOAF(s))
      .filter(s => s != null)
      .write

@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration

@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory}

import java.io.InputStream
import scala.io.Source
import scala.xml.pull.XMLEventReader
import java.io.{ByteArrayInputStream, InputStream}
import java.nio.charset.Charset
import javax.xml.stream.XMLInputFactory

object SparkCreateBaselineDataFrame {

@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
      if (response.getStatusLine.getStatusCode > 400) {
        tries -= 1
      } else
        return IOUtils.toString(response.getEntity.getContent)
        return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
    } catch {
      case e: Throwable =>
        println(s"Error on requesting ${r.getURI}")

@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
        IOUtils.toString(
          SparkEBILinksToOaf.getClass.getResourceAsStream(
            "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
          )
          ),
          Charset.defaultCharset()
        )
      )
    parser.parseArgument(args)

@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
    val workingPath = parser.get("workingPath")
    log.info("workingPath: {}", workingPath)

    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
    log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)

    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
    log.info("outputBasePath: {}", outputBasePath)
    val targetPath = parser.get("targetPath")
    log.info("targetPath: {}", targetPath)

    val hdfsServerUri = parser.get("hdfsServerUri")
    log.info("hdfsServerUri: {}", hdfsServerUri)
    log.info("hdfsServerUri: {}", targetPath)

    val skipUpdate = parser.get("skipUpdate")
    log.info("skipUpdate: {}", skipUpdate)

@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
    if (!"true".equalsIgnoreCase(skipUpdate)) {
      downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
      val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
      val inputFactory = XMLInputFactory.newInstance
      val ds: Dataset[PMArticle] = spark.createDataset(
        k.filter(i => i._1.endsWith(".gz"))
          .flatMap(i => {
            val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
            val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
            new PMParser(xml)
          })
      )

@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
        .map(a => PubMedToOaf.convert(a, vocabularies))
        .as[Oaf]
        .filter(p => p != null),
      s"$outputBasePath/$MDSTORE_DATA_PATH"
      targetPath
    )

    val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
    val mdStoreSize = df.count
    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
  }
}

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed

import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
import javax.xml.stream.XMLEventReader
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}

/** @param xml
  */
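The hunks above replace the deprecated scala.xml.pull.XMLEventReader with the StAX reader from javax.xml.stream. A minimal, self-contained sketch of that migration in isolation (PMParser consuming a StAX event stream is taken from the diff; the inline payload here is a hypothetical stand-in for a decompressed baseline record):

import java.io.ByteArrayInputStream
import javax.xml.stream.{XMLEventReader, XMLInputFactory}

object StaxMigrationSketch {
  def main(args: Array[String]): Unit = {
    val payload = "<PubmedArticleSet><PubmedArticle/></PubmedArticleSet>" // hypothetical record
    val inputFactory = XMLInputFactory.newInstance                        // replaces the scala.xml.pull reader
    val reader: XMLEventReader =
      inputFactory.createXMLEventReader(new ByteArrayInputStream(payload.getBytes()))
    while (reader.hasNext) println(reader.nextEvent())                    // PMParser walks the same event stream
  }
}
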
@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ -119,7 +119,9 @@ public class ReadCOCITest {
        workingDir.toString() + "/COCI",
        "-outputPath",
        workingDir.toString() + "/COCI_json/",
        "-inputFile", "input1;input2;input3;input4;input5"
        "-inputFile", "input1;input2;input3;input4;input5",
        "-format",
        "COCI"
      });

    final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

@ -0,0 +1,224 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.person;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob;
|
||||
import eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson;
|
||||
import eu.dnetlib.dhp.collection.orcid.model.Author;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class CreatePersonAS {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(CreatePersonAS.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(CreatePersonAS.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(CreatePersonAS.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.codegen.wholeStage", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(CreatePersonAS.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void testAuthors() throws Exception {
|
||||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/person/")
|
||||
.getPath();
|
||||
|
||||
// spark
|
||||
// .read()
|
||||
// .parquet(inputPath + "Authors")
|
||||
// .as(Encoders.bean(Author.class))
|
||||
// .filter((FilterFunction<Author>) a -> Optional.ofNullable(a.getOtherNames()).isPresent() &&
|
||||
// Optional.ofNullable(a.getBiography()).isPresent())
|
||||
// .write()
|
||||
// .mode(SaveMode.Overwrite)
|
||||
// .parquet(workingDir.toString() + "AuthorsSubset");
|
||||
|
||||
ExtractPerson
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet1",
|
||||
"-workingDir",
|
||||
workingDir.toString() + "/working"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> relations = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
|
||||
.filter(v -> "eu.dnetlib.dhp.schema.oaf.Relation".equalsIgnoreCase(v._1().toString()))
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
//
|
||||
JavaRDD<Person> people = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
|
||||
.filter(v -> "eu.dnetlib.dhp.schema.oaf.Person".equalsIgnoreCase(v._1().toString()))
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Person) aa.getPayload()));
|
||||
//
|
||||
Assertions.assertEquals(7, people.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"Paulo",
|
||||
people
|
||||
.filter(
|
||||
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
|
||||
.first()
|
||||
.getGivenName());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"Tavares",
|
||||
people
|
||||
.filter(
|
||||
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
|
||||
.first()
|
||||
.getFamilyName());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4,
|
||||
people
|
||||
.filter(
|
||||
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
|
||||
.first()
|
||||
.getAlternativeNames()
|
||||
.size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4,
|
||||
people
|
||||
.filter(
|
||||
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
|
||||
.first()
|
||||
.getPid()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
people
|
||||
.filter(
|
||||
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
|
||||
.first()
|
||||
.getPid()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
p -> p.getSchema().equalsIgnoreCase("Scopus Author ID")
|
||||
&& p.getValue().equalsIgnoreCase("15119405200")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
16,
|
||||
relations
|
||||
.filter(r -> r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED))
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
14,
|
||||
relations
|
||||
.filter(r -> r.getRelClass().equalsIgnoreCase(ModelConstants.PERSON_PERSON_HASCOAUTHORED))
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3,
|
||||
relations
|
||||
.filter(
|
||||
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
|
||||
&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED))
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
relations
|
||||
.filter(
|
||||
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
|
||||
&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED)
|
||||
&& r.getTarget().startsWith("50|doi"))
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
relations
|
||||
.filter(
|
||||
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
|
||||
&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED)
|
||||
&& r.getTarget().startsWith("50|arXiv"))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
relations
|
||||
.filter(
|
||||
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
|
||||
&& r.getRelClass().equalsIgnoreCase(ModelConstants.PERSON_PERSON_HASCOAUTHORED))
|
||||
.count());
|
||||
Assertions.assertEquals(33, relations.count());
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -75,7 +75,11 @@ public class CreateASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/")
|
||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
|
||||
.getPath();
|
||||
String blackListPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetFromWebEntries
|
||||
|
@ -86,7 +90,8 @@ public class CreateASTest {
|
|||
"-sourcePath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet1"
|
||||
workingDir.toString() + "/actionSet1",
|
||||
"-blackListPath", blackListPath
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
@ -96,7 +101,7 @@ public class CreateASTest {
|
|||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
Assertions.assertEquals(64, tmp.count());
|
||||
Assertions.assertEquals(58, tmp.count());
|
||||
|
||||
}
|
||||
|
||||
|
@ -109,6 +114,10 @@ public class CreateASTest {
|
|||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/")
|
||||
.getPath();
|
||||
String blackListPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetFromWebEntries
|
||||
.main(
|
||||
|
@ -118,7 +127,8 @@ public class CreateASTest {
|
|||
"-sourcePath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet1"
|
||||
workingDir.toString() + "/actionSet1",
|
||||
"-blackListPath", blackListPath
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
@ -184,7 +194,7 @@ public class CreateASTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
2, tmp
|
||||
.filter(
|
||||
r -> r
|
||||
.getSource()
|
||||
|
@ -197,7 +207,7 @@ public class CreateASTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
2, tmp
|
||||
.filter(
|
||||
r -> r
|
||||
.getTarget()
|
||||
|
@ -210,7 +220,7 @@ public class CreateASTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, tmp
|
||||
1, tmp
|
||||
.filter(
|
||||
r -> r
|
||||
.getTarget()
|
||||
|
@ -224,7 +234,7 @@ public class CreateASTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, tmp
|
||||
1, tmp
|
||||
.filter(
|
||||
r -> r
|
||||
.getTarget()
|
||||
|
@ -238,7 +248,7 @@ public class CreateASTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, tmp
|
||||
0, tmp
|
||||
.filter(
|
||||
r -> r
|
||||
.getTarget()
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.file;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class FileGZipMultipleNodeTest {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
|
||||
|
||||
private final ApiDescriptor api = new ApiDescriptor();
|
||||
|
||||
private FileGZipCollectorPlugin plugin;
|
||||
|
||||
private static final String SPLIT_ON_ELEMENT = "incollection,article";
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
|
||||
final String gzipFile = Objects
|
||||
.requireNonNull(
|
||||
this
|
||||
.getClass()
|
||||
.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
|
||||
.getFile();
|
||||
|
||||
api.setBaseUrl(gzipFile);
|
||||
|
||||
HashMap<String, String> params = new HashMap<>();
|
||||
params.put("splitOnElement", SPLIT_ON_ELEMENT);
|
||||
|
||||
api.setParams(params);
|
||||
|
||||
FileSystem fs = FileSystem.get(new Configuration());
|
||||
plugin = new FileGZipCollectorPlugin(fs);
|
||||
}
|
||||
|
||||
@Test
|
||||
void test() throws CollectorException {
|
||||
|
||||
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
|
||||
|
||||
stream.limit(10).forEach(s -> {
|
||||
Assertions.assertTrue(s.length() > 0);
|
||||
log.info(s);
|
||||
});
|
||||
}
|
||||
}
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.rest;
|
|||
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
|
@ -35,11 +36,11 @@ public class OsfPreprintCollectorTest {
|
|||
private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";
|
||||
|
||||
private final String resumptionParam = "page";
|
||||
private final String resumptionType = "page";
|
||||
private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
|
||||
private final String resumptionType = "scan";
|
||||
private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";
|
||||
|
||||
private final String resultSizeParam = "";
|
||||
private final String resultSizeValue = "";
|
||||
private final String resultSizeParam = "page[size]";
|
||||
private final String resultSizeValue = "100";
|
||||
|
||||
private final String resultFormatParam = "format";
|
||||
private final String resultFormatValue = "json";
|
||||
|
@ -69,11 +70,11 @@ public class OsfPreprintCollectorTest {
|
|||
|
||||
@Test
|
||||
@Disabled
|
||||
void test() throws CollectorException {
|
||||
void test_limited() throws CollectorException {
|
||||
final AtomicInteger i = new AtomicInteger(0);
|
||||
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||
|
||||
stream.limit(200).forEach(s -> {
|
||||
stream.limit(2000).forEach(s -> {
|
||||
Assertions.assertTrue(s.length() > 0);
|
||||
i.incrementAndGet();
|
||||
log.info(s);
|
||||
|
@ -82,4 +83,23 @@ public class OsfPreprintCollectorTest {
|
|||
log.info("{}", i.intValue());
|
||||
Assertions.assertTrue(i.intValue() > 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
void test_all() throws CollectorException {
|
||||
final AtomicLong i = new AtomicLong(0);
|
||||
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||
|
||||
stream.forEach(s -> {
|
||||
Assertions.assertTrue(s.length() > 0);
|
||||
if ((i.incrementAndGet() % 1000) == 0) {
|
||||
log.info("COLLECTED: {}", i.get());
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
log.info("TOTAL: {}", i.get());
|
||||
Assertions.assertTrue(i.get() > 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,6 +4,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Stream;
|
||||
|
@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
@ -25,18 +32,18 @@ class RestCollectorPluginTest {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
|
||||
|
||||
private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
|
||||
private final String resumptionType = "count";
|
||||
private final String resumptionParam = "from";
|
||||
private final String entityXpath = "//hits/hits";
|
||||
private final String resumptionXpath = "//hits";
|
||||
private final String resultTotalXpath = "//hits/total";
|
||||
private final String resultFormatParam = "format";
|
||||
private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
|
||||
private final String resumptionType = "discover";
|
||||
private final String resumptionParam = "skip";
|
||||
private final String entityXpath = "//*[local-name()='data']";
|
||||
private final String resumptionXpath = "";
|
||||
private final String resultTotalXpath = "//*[local-name()='count']";
|
||||
private final String resultFormatParam = "";
|
||||
private final String resultFormatValue = "json";
|
||||
private final String resultSizeParam = "size";
|
||||
private final String resultSizeParam = "top";
|
||||
private final String resultSizeValue = "10";
|
||||
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
|
||||
private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
|
||||
private final String query = "";
|
||||
// private String query = "=(sources:engrXiv AND type:preprint)";
|
||||
|
||||
private final String protocolDescriptor = "rest_json2xml";
|
||||
|
@ -56,6 +63,7 @@ class RestCollectorPluginTest {
|
|||
params.put("resultSizeValue", resultSizeValue);
|
||||
params.put("queryParams", query);
|
||||
params.put("entityXpath", entityXpath);
|
||||
params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
|
||||
|
||||
api.setBaseUrl(baseUrl);
|
||||
api.setParams(params);
|
||||
|
@ -78,4 +86,19 @@ class RestCollectorPluginTest {
|
|||
log.info("{}", i.intValue());
|
||||
Assertions.assertTrue(i.intValue() > 0);
|
||||
}
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
void testUrl() throws IOException {
|
||||
String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
|
||||
URL url = new URL(url_s);
|
||||
final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestMethod("GET");
|
||||
conn.setRequestProperty("User-Agent", "OpenAIRE");
|
||||
Gson gson = new Gson();
|
||||
System.out.println("Request header");
|
||||
System.out.println(gson.toJson(conn.getHeaderFields()));
|
||||
InputStream inputStream = conn.getInputStream();
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ public class RestIteratorTest {

    final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
      resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
      query, entityXpath, authMethod, authToken, resultOffsetParam);
      query, entityXpath, authMethod, authToken, resultOffsetParam, null);
    int i = 20;
    while (iterator.hasNext() && i > 0) {
      String result = iterator.next();

@ -0,0 +1,10 @@
|
|||
{"orcid":"0000-0001-6291-9619","title":"A Visible Light Driven Photoelectrochemical Chloramphenicol Aptasensor Based on a Gold Nanoparticle-Functionalized 3D Flower-like MoS<sub>2</sub>/TiO<sub>2</sub> Heterostructure","pids":[{"value":"10.1021/acs.langmuir.1c02956","schema":"doi"},{"value":"2-s2.0-85124885368","schema":"eid"},{"value":"15205827 07437463","schema":"issn"}]}
|
||||
{"orcid":"0000-0002-3210-3034","title":"A Visible Light Driven Photoelectrochemical Chloramphenicol Aptasensor Based on a Gold Nanoparticle-Functionalized 3D Flower-like MoS<sub>2</sub>/TiO<sub>2</sub> Heterostructure","pids":[{"value":"10.1021/acs.langmuir.1c02956","schema":"doi"},{"value":"2-s2.0-85124885368","schema":"eid"},{"value":"15205827 07437463","schema":"issn"}]}
|
||||
{"orcid":"0000-0001-6291-9619","title":"Study of High-Transverse-Momentum Higgs Boson Production in Association with a Vector Boson in the <math display=\"inline\"><mrow><mi>q</mi><mi>q</mi><mi>b</mi><mi>b</mi></mrow></math> Final State with the ATLAS Detector","pids":[{"value":"2736741","schema":"other-id"},{"value":"10.1103/PhysRevLett.132.131802","schema":"doi"},{"value":"2312.07605","schema":"arxiv"}]}
|
||||
{"orcid":"0000-0002-3210-3034","title":"Study of High-Transverse-Momentum Higgs Boson Production in Association with a Vector Boson in the <math display=\"inline\"><mrow><mi>q</mi><mi>q</mi><mi>b</mi><mi>b</mi></mrow></math> Final State with the ATLAS Detector","pids":[{"value":"2736741","schema":"other-id"},{"value":"10.1103/PhysRevLett.132.131802","schema":"doi"},{"value":"2312.07605","schema":"arxiv"}]}
|
||||
{"orcid":"0000-0002-9030-7609","title":"Search for supersymmetry in a final state containing two photons and missing transverse momentum in √s = 13 TeV pp collisions at the LHC using the ATLAS detector","pids":[{"value":"10.1140/epjc/s10052-016-4344-x","schema":"doi"},{"value":"2-s2.0-84988710988","schema":"eid"},{"value":"14346052 14346044","schema":"issn"}]}
|
||||
{"orcid":"0000-0003-2552-9691","title":"Search for supersymmetry in a final state containing two photons and missing transverse momentum in $\\sqrt{s}$ = 13 TeV $pp$ collisions at the LHC using the ATLAS detector","pids":[{"value":"1473744","schema":"other-id"},{"value":"10.1140/epjc/s10052-016-4344-x","schema":"doi"},{"value":"1606.09150","schema":"arxiv"}]}
|
||||
{"orcid":"0000-0003-0305-8980","title":"Search for supersymmetry in a final state containing two photons and missing transverse momentum in √s = 13 TeV pp collisions at the LHC using the ATLAS detector","pids":[{"value":"10.1140/epjc/s10052-016-4344-x","schema":"doi"},{"value":"2-s2.0-84988710988","schema":"eid"}]}
|
||||
{"orcid":"0000-0002-9030-7609","title":"Measurement of the energy response of the ATLAS calorimeter to charged pions from $W^{\\pm }\\rightarrow \\tau ^{\\pm }(\\rightarrow \\pi ^{\\pm }\\nu _{\\tau })\\nu _{\\tau }$ events in Run 2 data","pids":[{"value":"1909507","schema":"other-id"},{"value":"10.1140/epjc/s10052-022-10117-2","schema":"doi"},{"value":"2108.09043","schema":"arxiv"}]}
|
||||
{"orcid":"0000-0003-2629-4046","title":"Measurement of the energy response of the ATLAS calorimeter to charged pions from $W^{\\pm }\\rightarrow \\tau ^{\\pm }(\\rightarrow \\pi ^{\\pm }\\nu _{\\tau })\\nu _{\\tau }$ events in Run 2 data","pids":[{"value":"1909507","schema":"other-id"},{"value":"10.1140/epjc/s10052-022-10117-2","schema":"doi"},{"value":"2108.09043","schema":"arxiv"}]}
|
||||
{"orcid":"0000-0001-8582-8912","title":"Measurement of the energy response of the ATLAS calorimeter to charged pions from $W^{\\pm }\\rightarrow \\tau ^{\\pm }(\\rightarrow \\pi ^{\\pm }\\nu _{\\tau })\\nu _{\\tau }$ events in Run 2 data","pids":[{"value":"1909507","schema":"other-id"},{"value":"10.1140/epjc/s10052-022-10117-2","schema":"doi"},{"value":"2108.09043","schema":"arxiv"}]}
|
|
@ -789,10 +789,6 @@
|
|||
"value": "2227-9717",
|
||||
"type": "electronic"
|
||||
},
|
||||
{
|
||||
"value": "VALUE",
|
||||
"type": "PIPPO"
|
||||
},
|
||||
{
|
||||
"value": "1063-4584",
|
||||
"type": "pu"
|
||||
|
|
Binary file not shown.
|
@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import org.junit.jupiter.api.BeforeEach
import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
import org.apache.commons.io.IOUtils
import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}

@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
    super.setUpVocabulary()
  }

  @Test
  def mappingRecord(): Unit = {
    val input =
      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")

    println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))

  }

}

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test

@ -18,10 +19,8 @@ class MAGMappingTest {
      .master("local[*]")
      .getOrCreate()

    val s = new SparkMagOrganizationAS(null, null, null)

    s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")

    val s = new SparkMAGtoOAF(null, null, null)
    s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
  }

  @Test

@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension

import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.io.Source

@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {

  @Test
  def testEBIData() = {
    val inputXML = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
      .mkString
    val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
    val inputFactory = XMLInputFactory.newInstance
    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
  }

@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {

  @Test
  def testParsingPubmedXML(): Unit = {
    val xml = new XMLEventReader(
      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    )
    val inputFactory = XMLInputFactory.newInstance

    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))

    val parser = new PMParser(xml)
    parser.foreach(checkPMArticle)
  }

@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
  @Test
  def testPubmedMapping(): Unit = {

    val xml = new XMLEventReader(
      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    )
    val inputFactory = XMLInputFactory.newInstance
    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))

    val parser = new PMParser(xml)
    val results = ListBuffer[Oaf]()
    parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))

@ -53,24 +53,10 @@
|
|||
<artifactId>dhp-pace-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.scala-lang.modules</groupId>
|
||||
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
|
||||
<version>1.0.2</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.scala-lang.modules</groupId>
|
||||
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
|
||||
<version>2.11.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||
|
@ -79,16 +65,10 @@
|
|||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.arakelian</groupId>
|
||||
<artifactId>java-jq</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
|
@ -101,10 +81,6 @@
|
|||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
|
|
|
@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
|||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.SparkCompatUtils;
|
||||
import scala.Tuple3;
|
||||
import scala.collection.JavaConversions;
|
||||
|
||||
|
@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
|||
Dataset<Row> pivotHistory = spark
|
||||
.createDataset(
|
||||
Collections.emptyList(),
|
||||
RowEncoder
|
||||
.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
|
||||
SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
|
||||
|
||||
if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
|
||||
pivotHistory = spark
|
||||
|
|
|
@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
|
|||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
|
|||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
|
||||
"",
|
||||
r._1()._2().getOriginalId().get(0),
|
||||
r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
|
||||
r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
|
||||
r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
|
||||
r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
|
||||
r._1()._2().getCollectedfrom().get(0).getValue(),
|
||||
Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
|
||||
Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
|
||||
Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
|
||||
Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
|
||||
Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
|
||||
Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
|
||||
"",
|
||||
structuredPropertyListToString(r._1()._2().getPid()),
|
||||
parseECField(r._1()._2().getEclegalbody()),
|
||||
|
|
|
@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
|
|||
final Organization o = r._2()._2();
|
||||
return new OrgSimRel(
|
||||
r._1()._1(),
|
||||
o.getOriginalId().get(0),
|
||||
Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
|
||||
Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
|
||||
Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
|
||||
Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
|
||||
|
@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
|
|||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
|
||||
OrgSimRel orgSimRel = r._1()._2();
|
||||
orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
|
||||
orgSimRel
|
||||
.setLocal_id(
|
||||
Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
|
||||
return orgSimRel;
|
||||
},
|
||||
Encoders.bean(OrgSimRel.class));
|
||||
|
|
|
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple2;
import scala.Tuple3;

@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
    StructType idsSchema = StructType
      .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");

    Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
    Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));

    for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
      String entityPath = graphBasePath + '/' + entityType.name();

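This hunk and the SparkCreateMergeRels change above swap RowEncoder.apply(schema) for SparkCompatUtils.encoderFor(schema). The utility itself is not part of this diff; a hedged sketch of what such a shim can look like on a Spark line where RowEncoder.apply still exists (the real eu.dnetlib.pace.util.SparkCompatUtils may be implemented differently):

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType

// Hypothetical stand-in: a single indirection point so callers do not depend on the
// RowEncoder entry point, whose factory method changed across Spark releases.
object CompatEncoders {
  def encoderFor(schema: StructType): ExpressionEncoder[Row] =
    RowEncoder.apply(schema) // pre-Spark-3.4 API; newer lines expose an equivalent factory
}

Callers such as spark.emptyDataset(CompatEncoders.encoderFor(idsSchema)) then compile unchanged on either side of the version boundary.
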
@ -0,0 +1,103 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
class DatasetMergerTest implements Serializable {
|
||||
|
||||
private List<Tuple2<String, Dataset>> datasets;
|
||||
|
||||
private String testEntityBasePath;
|
||||
private DataInfo dataInfo;
|
||||
private final String dedupId = "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148";
|
||||
private Dataset dataset_top;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
testEntityBasePath = Paths
|
||||
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
datasets = readSample(testEntityBasePath + "/dataset_merge.json", Dataset.class);
|
||||
|
||||
dataset_top = getTopPub(datasets);
|
||||
|
||||
dataInfo = setDI();
|
||||
}
|
||||
|
||||
@Test
|
||||
void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
|
||||
Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
|
||||
|
||||
// verify id
|
||||
assertEquals(dedupId, pub_merged.getId());
|
||||
assertEquals(2, pub_merged.getInstance().size());
|
||||
}
|
||||
|
||||
public DataInfo setDI() {
|
||||
DataInfo dataInfo = new DataInfo();
|
||||
dataInfo.setTrust("0.9");
|
||||
dataInfo.setDeletedbyinference(false);
|
||||
dataInfo.setInferenceprovenance("testing");
|
||||
dataInfo.setInferred(true);
|
||||
return dataInfo;
|
||||
}
|
||||
|
||||
public Dataset getTopPub(List<Tuple2<String, Dataset>> publications) {
|
||||
|
||||
Double maxTrust = 0.0;
|
||||
Dataset maxPub = new Dataset();
|
||||
for (Tuple2<String, Dataset> publication : publications) {
|
||||
Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
|
||||
if (pubTrust > maxTrust) {
|
||||
maxTrust = pubTrust;
|
||||
maxPub = publication._2();
|
||||
}
|
||||
}
|
||||
return maxPub;
|
||||
}
|
||||
|
||||
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||
List<Tuple2<String, T>> res = new ArrayList<>();
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(path));
|
||||
String line = reader.readLine();
|
||||
while (line != null) {
|
||||
res
|
||||
.add(
|
||||
new Tuple2<>(
|
||||
MapDocumentUtil.getJPathString("$.id", line),
|
||||
new ObjectMapper().readValue(line, clazz)));
|
||||
// read next line
|
||||
line = reader.readLine();
|
||||
}
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -93,14 +93,14 @@ class EntityMergerTest implements Serializable {
|
|||
assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
|
||||
assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
|
||||
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
|
||||
assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
|
||||
assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
|
||||
assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
|
||||
assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
|
||||
assertEquals(pub_top.getResulttype().getClassid(), pub_merged.getResulttype().getClassid());
|
||||
assertEquals(pub_top.getLanguage().getClassid(), pub_merged.getLanguage().getClassid());
|
||||
assertEquals("Elsevier BV", pub_merged.getPublisher().getValue());
|
||||
assertEquals(pub_top.getEmbargoenddate().getValue(), pub_merged.getEmbargoenddate().getValue());
|
||||
assertEquals(pub_top.getResourcetype().getClassid(), "");
|
||||
assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
|
||||
assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
|
||||
assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
|
||||
// assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
|
||||
assertEquals(3, pub_merged.getInstance().size());
|
||||
assertEquals(2, pub_merged.getCountry().size());
|
||||
assertEquals(0, pub_merged.getSubject().size());
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -172,7 +172,7 @@ public class SparkBulkTagJob {
      .option("compression", "gzip")
      .json(outputPath + "project");

    readPath(spark, outputPath + "project", Datasource.class)
    readPath(spark, outputPath + "project", Project.class)
      .write()
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")

@ -61,7 +61,8 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
          subject.getQualifier().setClassname(vocabulary.getName());
        }
      } else {
        final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo())
        final String provenanceActionClassId = Optional
          .ofNullable(subject.getDataInfo())
          .map(DataInfo::getProvenanceaction)
          .map(Qualifier::getClassid)
          .orElse(null);

@ -398,6 +398,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
    o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
    o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
    o.setCountry(prepareQualifierSplitting(rs.getString("country")));
    o.setOrganizationType(Organization.OrganizationType.valueOf(rs.getString("typology")));
    o.setDataInfo(info);
    o.setLastupdatetimestamp(lastUpdateTimestamp);

@ -156,6 +156,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -190,6 +191,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -224,6 +226,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -258,6 +261,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -292,6 +296,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -326,6 +331,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -360,6 +366,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -394,6 +401,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -116,17 +116,19 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=10000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>8000</arg>
|
||||
<arg>--numPartitions</arg><arg>10000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -143,17 +145,19 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=4000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>4000</arg>
|
||||
<arg>--numPartitions</arg><arg>8000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -170,11 +174,13 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
|
@ -197,17 +203,19 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=1000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>300</arg>
|
||||
<arg>--numPartitions</arg><arg>1000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -224,17 +232,19 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=200
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>100</arg>
|
||||
<arg>--numPartitions</arg><arg>200</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -251,17 +261,19 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=1000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/organization</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>400</arg>
|
||||
<arg>--numPartitions</arg><arg>1000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -278,17 +290,19 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=1000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/project</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>100</arg>
|
||||
<arg>--numPartitions</arg><arg>1000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -305,17 +319,19 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/relation</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>10000</arg>
|
||||
<arg>--numPartitions</arg><arg>15000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -45,6 +45,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
|
@ -79,6 +80,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=10000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
|
|
|
@ -28,7 +28,8 @@ SELECT
|
|||
(array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1] AS ecinternationalorganization,
|
||||
(array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1] AS ecenterprise,
|
||||
(array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1] AS ecsmevalidated,
|
||||
(array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode
|
||||
(array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode,
|
||||
org_types.name AS typology
|
||||
FROM organizations o
|
||||
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
|
||||
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
||||
|
@ -37,6 +38,7 @@ FROM organizations o
|
|||
LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different')
|
||||
LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
|
||||
LEFT OUTER JOIN other_ids idup ON (od.id = idup.id)
|
||||
LEFT OUTER JOIN org_types ON (org_types.val = o.type)
|
||||
WHERE
|
||||
o.status = 'approved' OR o.status = 'suggested'
|
||||
GROUP BY
|
||||
|
@ -44,4 +46,5 @@ GROUP BY
|
|||
o.name,
|
||||
o.creation_date,
|
||||
o.modification_date,
|
||||
o.country;
|
||||
o.country,
|
||||
org_types.name;
|
|
@ -0,0 +1,5 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,166 @@
|
|||
{
|
||||
"cites":{
|
||||
"original":"Cites",
|
||||
"inverse":"IsCitedBy"
|
||||
},
|
||||
"compiles":{
|
||||
"original":"Compiles",
|
||||
"inverse":"IsCompiledBy"
|
||||
},
|
||||
"continues":{
|
||||
"original":"Continues",
|
||||
"inverse":"IsContinuedBy"
|
||||
},
|
||||
"derives":{
|
||||
"original":"IsSourceOf",
|
||||
"inverse":"IsDerivedFrom"
|
||||
},
|
||||
"describes":{
|
||||
"original":"Describes",
|
||||
"inverse":"IsDescribedBy"
|
||||
},
|
||||
"documents":{
|
||||
"original":"Documents",
|
||||
"inverse":"IsDocumentedBy"
|
||||
},
|
||||
"hasmetadata":{
|
||||
"original":"HasMetadata",
|
||||
"inverse":"IsMetadataOf"
|
||||
},
|
||||
"hasassociationwith":{
|
||||
"original":"HasAssociationWith",
|
||||
"inverse":"HasAssociationWith"
|
||||
},
|
||||
"haspart":{
|
||||
"original":"HasPart",
|
||||
"inverse":"IsPartOf"
|
||||
},
|
||||
"hasversion":{
|
||||
"original":"HasVersion",
|
||||
"inverse":"IsVersionOf"
|
||||
},
|
||||
"iscitedby":{
|
||||
"original":"IsCitedBy",
|
||||
"inverse":"Cites"
|
||||
},
|
||||
"iscompiledby":{
|
||||
"original":"IsCompiledBy",
|
||||
"inverse":"Compiles"
|
||||
},
|
||||
"iscontinuedby":{
|
||||
"original":"IsContinuedBy",
|
||||
"inverse":"Continues"
|
||||
},
|
||||
"isderivedfrom":{
|
||||
"original":"IsDerivedFrom",
|
||||
"inverse":"IsSourceOf"
|
||||
},
|
||||
"isdescribedby":{
|
||||
"original":"IsDescribedBy",
|
||||
"inverse":"Describes"
|
||||
},
|
||||
"isdocumentedby":{
|
||||
"original":"IsDocumentedBy",
|
||||
"inverse":"Documents"
|
||||
},
|
||||
"isidenticalto":{
|
||||
"original":"IsIdenticalTo",
|
||||
"inverse":"IsIdenticalTo"
|
||||
},
|
||||
"ismetadatafor":{
|
||||
"original":"IsMetadataFor",
|
||||
"inverse":"IsMetadataOf"
|
||||
},
|
||||
"ismetadataof":{
|
||||
"original":"IsMetadataOf",
|
||||
"inverse":"IsMetadataFor"
|
||||
},
|
||||
"isnewversionof":{
|
||||
"original":"IsNewVersionOf",
|
||||
"inverse":"IsPreviousVersionOf"
|
||||
},
|
||||
"isobsoletedby":{
|
||||
"original":"IsObsoletedBy",
|
||||
"inverse":"Obsoletes"
|
||||
},
|
||||
"isoriginalformof":{
|
||||
"original":"IsOriginalFormOf",
|
||||
"inverse":"IsVariantFormOf"
|
||||
},
|
||||
"ispartof":{
|
||||
"original":"IsPartOf",
|
||||
"inverse":"HasPart"
|
||||
},
|
||||
"ispreviousversionof":{
|
||||
"original":"IsPreviousVersionOf",
|
||||
"inverse":"IsNewVersionOf"
|
||||
},
|
||||
"isreferencedby":{
|
||||
"original":"IsReferencedBy",
|
||||
"inverse":"References"
|
||||
},
|
||||
"isrelatedto":{
|
||||
"original":"IsRelatedTo",
|
||||
"inverse":"IsRelatedTo"
|
||||
},
|
||||
"isrequiredby":{
|
||||
"original":"IsRequiredBy",
|
||||
"inverse":"Requires"
|
||||
},
|
||||
"isreviewedby":{
|
||||
"original":"IsReviewedBy",
|
||||
"inverse":"Reviews"
|
||||
},
|
||||
"issourceof":{
|
||||
"original":"IsSourceOf",
|
||||
"inverse":"IsDerivedFrom"
|
||||
},
|
||||
"issupplementedby":{
|
||||
"original":"IsSupplementedBy",
|
||||
"inverse":"IsSupplementTo"
|
||||
},
|
||||
"issupplementto":{
|
||||
"original":"IsSupplementTo",
|
||||
"inverse":"IsSupplementedBy"
|
||||
},
|
||||
"isvariantformof":{
|
||||
"original":"IsVariantFormOf",
|
||||
"inverse":"IsOriginalFormOf"
|
||||
},
|
||||
"isversionof":{
|
||||
"original":"IsVersionOf",
|
||||
"inverse":"HasVersion"
|
||||
},
|
||||
"obsoletes":{
|
||||
"original":"Obsoletes",
|
||||
"inverse":"IsObsoletedBy"
|
||||
},
|
||||
"references":{
|
||||
"original":"References",
|
||||
"inverse":"IsReferencedBy"
|
||||
},
|
||||
"requires":{
|
||||
"original":"Requires",
|
||||
"inverse":"IsRequiredBy"
|
||||
},
|
||||
"related":{
|
||||
"original":"IsRelatedTo",
|
||||
"inverse":"IsRelatedTo"
|
||||
},
|
||||
"reviews":{
|
||||
"original":"Reviews",
|
||||
"inverse":"IsReviewedBy"
|
||||
},
|
||||
"unknown":{
|
||||
"original":"Unknown",
|
||||
"inverse":"Unknown"
|
||||
},
|
||||
"isamongtopnsimilardocuments": {
|
||||
"original": "IsAmongTopNSimilarDocuments",
|
||||
"inverse": "HasAmongTopNSimilarDocuments"
|
||||
},
|
||||
"hasamongtopnsimilardocuments": {
|
||||
"original": "HasAmongTopNSimilarDocuments",
|
||||
"inverse": "IsAmongTopNSimilarDocuments"
|
||||
}
|
||||
}
|
|
@ -25,6 +25,22 @@ object SparkApplyHostedByMapToResult {
|
|||
val i = p.getInstance().asScala
|
||||
if (i.size == 1) {
|
||||
val inst: Instance = i.head
|
||||
patchInstance(p, ei, inst)
|
||||
|
||||
} else {
|
||||
val cf = i.map(ii => ii.getCollectedfrom.getValue)
|
||||
if (cf.contains("Crossref")) {
|
||||
i.foreach(ii => {
|
||||
patchInstance(p, ei, ii)
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
p
|
||||
})(Encoders.bean(classOf[Publication]))
|
||||
}
|
||||
|
||||
private def patchInstance(p: Publication, ei: EntityInfo, inst: Instance): Unit = {
|
||||
inst.getHostedby.setKey(ei.getHostedById)
|
||||
inst.getHostedby.setValue(ei.getName)
|
||||
if (ei.getOpenAccess) {
|
||||
|
@ -39,11 +55,6 @@ object SparkApplyHostedByMapToResult {
|
|||
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
p
|
||||
})(Encoders.bean(classOf[Publication]))
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
|
|
@ -0,0 +1,258 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.sx.scholix.{
|
||||
Scholix,
|
||||
ScholixCollectedFrom,
|
||||
ScholixEntityId,
|
||||
ScholixIdentifier,
|
||||
ScholixRelationship,
|
||||
ScholixResource
|
||||
}
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
||||
case class RelationInfo(
|
||||
source: String,
|
||||
target: String,
|
||||
relclass: String,
|
||||
id: String,
|
||||
collectedfrom: Seq[RelKeyValue]
|
||||
) {}
|
||||
case class RelKeyValue(key: String, value: String) {}
|
||||
|
||||
object ScholexplorerUtils {
|
||||
|
||||
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
case class RelationVocabulary(original: String, inverse: String) {}
|
||||
|
||||
val relations: Map[String, RelationVocabulary] = {
|
||||
val input = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
|
||||
)
|
||||
.mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
||||
json.extract[Map[String, RelationVocabulary]]
|
||||
}
|
||||
|
||||
def invRel(rel: String): String = {
|
||||
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
|
||||
if (semanticRelation != null)
|
||||
semanticRelation.inverse
|
||||
else
|
||||
null
|
||||
}
|
||||
|
||||
def generateDatasourceOpenAIREURLS(id: String): String = {
|
||||
if (id != null && id.length > 12)
|
||||
s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
|
||||
else
|
||||
null
|
||||
}
|
||||
|
||||
def findURLForPID(
|
||||
pidValue: List[StructuredProperty],
|
||||
urls: List[String]
|
||||
): List[(StructuredProperty, String)] = {
|
||||
pidValue.map { p =>
|
||||
val pv = p.getValue
|
||||
|
||||
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||
(p, r.orNull)
|
||||
}
|
||||
}
|
||||
|
||||
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
||||
if (r.getInstance() == null || r.getInstance().isEmpty)
|
||||
return List()
|
||||
r.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||
.filter(i => i.getPid != null && i.getUrl != null)
|
||||
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
||||
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
|
||||
.distinct
|
||||
.toList
|
||||
}
|
||||
|
||||
def generateScholixResourceFromResult(result: Result): ScholixResource = {
|
||||
|
||||
if (result.getInstance() == null || result.getInstance().size() == 0)
|
||||
return null
|
||||
|
||||
if (result.getPid == null || result.getPid.isEmpty)
|
||||
return null
|
||||
|
||||
val r = new ScholixResource
|
||||
r.setDnetIdentifier(result.getId)
|
||||
|
||||
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
|
||||
if (persistentIdentifiers.isEmpty)
|
||||
return null
|
||||
|
||||
r.setIdentifier(persistentIdentifiers.asJava)
|
||||
|
||||
r.setObjectType(result.getResulttype.getClassid)
|
||||
|
||||
r.setObjectSubType(
|
||||
result
|
||||
.getInstance()
|
||||
.asScala
|
||||
.filter(i => i != null && i.getInstancetype != null)
|
||||
.map(i => i.getInstancetype.getClassname)
|
||||
.distinct
|
||||
.head
|
||||
)
|
||||
|
||||
if (result.getTitle != null && result.getTitle.asScala.nonEmpty) {
|
||||
val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList
|
||||
if (titles.nonEmpty)
|
||||
r.setTitle(titles.head)
|
||||
else
|
||||
return null
|
||||
}
|
||||
if (result.getAuthor != null && !result.getAuthor.isEmpty) {
|
||||
val authors: List[ScholixEntityId] =
|
||||
result.getAuthor.asScala
|
||||
.map(a => {
|
||||
val entity = new ScholixEntityId()
|
||||
entity.setName(a.getFullname)
|
||||
if (a.getPid != null && a.getPid.size() > 0)
|
||||
entity.setIdentifiers(
|
||||
a.getPid.asScala
|
||||
.map(sp => {
|
||||
val id = new ScholixIdentifier()
|
||||
id.setIdentifier(sp.getValue)
|
||||
id.setSchema(sp.getQualifier.getClassid)
|
||||
id
|
||||
})
|
||||
.take(3)
|
||||
.toList
|
||||
.asJava
|
||||
)
|
||||
entity
|
||||
})
|
||||
.toList
|
||||
if (authors.nonEmpty)
|
||||
r.setCreator(authors.asJava)
|
||||
|
||||
}
|
||||
|
||||
val dt: List[String] = result
|
||||
.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getDateofacceptance != null)
|
||||
.map(i => i.getDateofacceptance.getValue)
|
||||
.toList
|
||||
if (dt.nonEmpty)
|
||||
r.setPublicationDate(dt.distinct.head)
|
||||
|
||||
r.setPublisher(
|
||||
result
|
||||
.getInstance()
|
||||
.asScala
|
||||
.map(i => i.getHostedby)
|
||||
.filter(h => !"unknown".equalsIgnoreCase(h.getValue))
|
||||
.map(h => {
|
||||
val eid = new ScholixEntityId()
|
||||
eid.setName(h.getValue)
|
||||
val id = new ScholixIdentifier()
|
||||
id.setIdentifier(h.getKey)
|
||||
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||
id.setUrl(generateDatasourceOpenAIREURLS(h.getKey))
|
||||
eid.setIdentifiers(List(id).asJava)
|
||||
eid
|
||||
})
|
||||
.distinct
|
||||
.asJava
|
||||
)
|
||||
|
||||
r.setCollectedFrom(
|
||||
result.getCollectedfrom.asScala
|
||||
.map(cf => {
|
||||
val scf = new ScholixCollectedFrom()
|
||||
scf.setProvisionMode("collected")
|
||||
scf.setCompletionStatus("complete")
|
||||
val eid = new ScholixEntityId()
|
||||
eid.setName(cf.getValue)
|
||||
val id = new ScholixIdentifier()
|
||||
id.setIdentifier(cf.getKey)
|
||||
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||
id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey))
|
||||
eid.setIdentifiers(List(id).asJava)
|
||||
scf.setProvider(eid)
|
||||
scf
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
|
||||
val s: Scholix = new Scholix
|
||||
s.setSource(source)
|
||||
if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
|
||||
s.setLinkprovider(
|
||||
relation.collectedfrom
|
||||
.map(cf => {
|
||||
val eid = new ScholixEntityId()
|
||||
eid.setName(cf.value)
|
||||
val id = new ScholixIdentifier()
|
||||
id.setIdentifier(cf.key)
|
||||
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||
id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
|
||||
eid.setIdentifiers(List(id).asJava)
|
||||
eid
|
||||
})
|
||||
.toList
|
||||
.asJava
|
||||
)
|
||||
else {
|
||||
val eid = new ScholixEntityId()
|
||||
eid.setName("OpenAIRE")
|
||||
val id = new ScholixIdentifier()
|
||||
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
|
||||
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||
id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
|
||||
eid.setIdentifiers(List(id).asJava)
|
||||
s.setLinkprovider(List(eid).asJava)
|
||||
}
|
||||
s.setIdentifier(relation.id)
|
||||
val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null)
|
||||
if (semanticRelation == null)
|
||||
return null
|
||||
s.setRelationship(
|
||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||
)
|
||||
s.setPublicationDate(source.getPublicationDate)
|
||||
s.setPublisher(source.getPublisher)
|
||||
val mockTarget = new ScholixResource
|
||||
mockTarget.setDnetIdentifier(relation.target)
|
||||
s.setTarget(mockTarget)
|
||||
s
|
||||
}
|
||||
|
||||
def updateTarget(s: Scholix, t: ScholixResource): String = {
|
||||
|
||||
s.setTarget(t)
|
||||
val spublishers: Seq[ScholixEntityId] =
|
||||
if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List()
|
||||
val tpublishers: Seq[ScholixEntityId] =
|
||||
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
|
||||
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
|
||||
s.setPublisher(mergedPublishers.asJava)
|
||||
mapper.writeValueAsString(s)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,141 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.schema.oaf.{
|
||||
KeyValue,
|
||||
OtherResearchProduct,
|
||||
Publication,
|
||||
Relation,
|
||||
Result,
|
||||
Software,
|
||||
Dataset => OafDataset
|
||||
}
|
||||
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
|
||||
import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
override def run(): Unit = {
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info("sourcePath: {}", sourcePath)
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info("targetPath: {}", targetPath)
|
||||
generateBidirectionalRelations(sourcePath, targetPath, spark)
|
||||
generateScholixResource(sourcePath, targetPath, spark)
|
||||
generateScholix(targetPath, spark)
|
||||
}
|
||||
|
||||
def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
|
||||
val entityMap: Map[String, StructType] = Map(
|
||||
"publication" -> Encoders.bean(classOf[Publication]).schema,
|
||||
"dataset" -> Encoders.bean(classOf[OafDataset]).schema,
|
||||
"software" -> Encoders.bean(classOf[Software]).schema,
|
||||
"otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
|
||||
)
|
||||
|
||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
|
||||
|
||||
val resDs = spark.emptyDataset[ScholixResource]
|
||||
val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
|
||||
println(s"adding ${item._1}")
|
||||
res.union(
|
||||
spark.read
|
||||
.schema(item._2)
|
||||
.json(s"$inputPath/${item._1}")
|
||||
.as[Result]
|
||||
.map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
|
||||
.filter(s => s != null)
|
||||
)
|
||||
})
|
||||
scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
|
||||
}
|
||||
|
||||
def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
|
||||
val relSchema = Encoders.bean(classOf[Relation]).schema
|
||||
|
||||
val relDF = spark.read
|
||||
.schema(relSchema)
|
||||
.json(s"$inputPath/relation")
|
||||
.where(
|
||||
"datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
|
||||
"and relClass <> 'merges' and relClass <> 'isMergedIn'"
|
||||
)
|
||||
.select("source", "target", "collectedfrom", "relClass")
|
||||
|
||||
def invRel: String => String = { s =>
|
||||
ScholexplorerUtils.invRel(s)
|
||||
}
|
||||
|
||||
import org.apache.spark.sql.functions.udf
|
||||
val inverseRelationUDF = udf(invRel)
|
||||
val inverseRelation = relDF.select(
|
||||
col("target").alias("source"),
|
||||
col("source").alias("target"),
|
||||
col("collectedfrom"),
|
||||
inverseRelationUDF(col("relClass")).alias("relClass")
|
||||
)
|
||||
|
||||
val bidRel = inverseRelation
|
||||
.union(relDF)
|
||||
.withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
|
||||
.withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
|
||||
.drop("collectedfrom")
|
||||
.withColumnRenamed("cf", "collectedfrom")
|
||||
.groupBy(col("id"))
|
||||
.agg(
|
||||
first("source").alias("source"),
|
||||
first("target").alias("target"),
|
||||
first("relClass").alias("relClass"),
|
||||
first("collectedfrom").alias("collectedfrom")
|
||||
)
|
||||
|
||||
bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
|
||||
|
||||
}
|
||||
|
||||
def generateScholix(outputPath: String, spark: SparkSession): Unit = {
|
||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
|
||||
|
||||
import spark.implicits._
|
||||
val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
|
||||
val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
|
||||
|
||||
val scholix_one_verse = relations
|
||||
.joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
|
||||
.map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
|
||||
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
|
||||
|
||||
val resourceTarget = relations
|
||||
.joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
|
||||
.map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
|
||||
|
||||
scholix_one_verse
|
||||
.joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
|
||||
.map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(s"$outputPath/scholix")
|
||||
}
|
||||
}
|
||||
|
||||
object SparkCreateScholexplorerDump {
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new SparkCreateScholexplorerDump(
|
||||
log = logger,
|
||||
args = args,
|
||||
propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json"
|
||||
).initialize().run()
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
package eu.dnetlib.dhp.sx.graph.scholix
|
||||
|
||||
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
|
||||
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.objenesis.strategy.StdInstantiatorStrategy
|
||||
|
||||
class ScholixGenerationTest {
|
||||
|
||||
@Test
|
||||
def generateScholix(): Unit = {
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
|
||||
val app = new SparkCreateScholexplorerDump(null, null, null)
|
||||
// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
|
||||
// app.generateBidirectionalRelations(
|
||||
// "/home/sandro/Downloads/scholix_sample/",
|
||||
// "/home/sandro/Downloads/scholix/",
|
||||
// spark
|
||||
// )
|
||||
app.generateScholix("/home/sandro/Downloads/scholix/", spark)
|
||||
|
||||
}
|
||||
}
|
|
@ -18,7 +18,7 @@
|
|||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
|
@ -59,12 +59,6 @@
|
|||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
|
@ -160,6 +154,26 @@
|
|||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>ant</artifactId>
|
||||
<groupId>org.apache.ant</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>antlr4-runtime</artifactId>
|
||||
<groupId>org.antlr</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>woodstox-core</artifactId>
|
||||
<groupId>com.fasterxml.woodstox</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>log4j</artifactId>
|
||||
<groupId>*</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
@ -206,5 +220,90 @@
|
|||
|
||||
</dependencies>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>spark-24</id>
|
||||
<activation>
|
||||
<activeByDefault>true</activeByDefault>
|
||||
</activation>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/sparksolr-3</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>spark-34</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/sparksolr-4</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>spark-35</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/sparksolr-4</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
|
||||
</project>
|
|
@ -31,7 +31,6 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.provision.XmlConverterJob;
|
||||
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
|
||||
|
||||
|
@ -48,7 +47,7 @@ public class IrishOaiExporterJob {
|
|||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
XmlConverterJob.class
|
||||
IrishOaiExporterJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
|
|
|
@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
|
|||
result
|
||||
.getTitle()
|
||||
.stream()
|
||||
.filter(t -> StringUtils.isNotBlank(t.getValue()))
|
||||
.findFirst()
|
||||
.map(StructuredProperty::getValue)
|
||||
.ifPresent(
|
||||
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
|
||||
title -> {
|
||||
re.setTitle(title);
|
||||
re
|
||||
.getTitle()
|
||||
.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||
});
|
||||
}
|
||||
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
|
||||
result
|
||||
|
|
|
@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.SparkContext;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.expressions.UserDefinedFunction;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -45,9 +37,9 @@ import scala.Tuple2;
|
|||
/**
|
||||
* XmlConverterJob converts the JoinedEntities as XML records
|
||||
*/
|
||||
public class XmlConverterJob {
|
||||
public class PayloadConverterJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);
|
||||
|
||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||
|
||||
|
@ -56,8 +48,8 @@ public class XmlConverterJob {
|
|||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
XmlConverterJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
|
||||
PayloadConverterJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -72,6 +64,12 @@ public class XmlConverterJob {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final Boolean validateXML = Optional
|
||||
.ofNullable(parser.get("validateXML"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.FALSE);
|
||||
log.info("validateXML: {}", validateXML);
|
||||
|
||||
final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
|
||||
log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
|
||||
|
||||
|
@ -86,18 +84,19 @@ public class XmlConverterJob {
|
|||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
convertToXml(
|
||||
createPayloads(
|
||||
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
|
||||
VocabularyGroup.loadVocsFromIS(isLookup));
|
||||
VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
|
||||
});
|
||||
}
|
||||
|
||||
private static void convertToXml(
|
||||
private static void createPayloads(
|
||||
final SparkSession spark,
|
||||
final String inputPath,
|
||||
final String outputPath,
|
||||
final ContextMapper contextMapper,
|
||||
final VocabularyGroup vocabularies) {
|
||||
final VocabularyGroup vocabularies,
|
||||
final Boolean validateXML) {
|
||||
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(
|
||||
prepareAccumulators(spark.sparkContext()),
|
||||
|
@ -118,7 +117,7 @@ public class XmlConverterJob {
|
|||
.as(Encoders.kryo(JoinedEntity.class))
|
||||
.map(
|
||||
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
|
||||
recordFactory.build(je),
|
||||
recordFactory.build(je, validateXML),
|
||||
ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
|
||||
.map(
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue