Compare commits

110 Commits
main...person

Author SHA1 Message Date
Miriam Baglioni c465835061 [Person]new implementation for the extraction of the coAuthorship relations 2024-07-09 12:29:55 +02:00
Miriam Baglioni ddd20e7f8e [Person]first implementation of the action set to include Person entity in the graph starting from the orcid data 2024-07-04 12:08:46 +02:00
Miriam Baglioni 67ff783e65 [Person]First implementation to include Person entity in the graph 2024-06-29 17:13:01 +02:00
Miriam Baglioni d35edac212 [IrishFunderList]make changed according to 9635 comment 20, 21, 22 and 23 2024-06-20 12:28:28 +02:00
Miriam Baglioni 6421f8fece Merge remote-tracking branch 'origin/beta' into beta 2024-06-19 11:12:15 +02:00
Miriam Baglioni ac270f795b [IrishFunderList]make changed according to 9635 comment 14, 15 and 16 2024-06-19 11:11:52 +02:00
Claudio Atzori dd541f8cf5 Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster.' (#447) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#447
2024-06-18 15:52:30 +02:00
Lampros Smyrnaios 285416c74e Merge branch 'beta' into beta 2024-06-18 13:50:38 +02:00
Lampros Smyrnaios 3095047e5e Miscellaneous updates to the copying operation to Impala Cluster:
- Fix not breaking out of the VIEWS-infinite-loop when the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" is set to "false".
- Exit the script when no HDFS-active-node was found, independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR".
- Fix view_name-recognition in a log-message, by using the more advanced "Perl-Compatible Regular Expressions" in "grep".
- Add error-handling for "compute stats" errors.
2024-06-18 14:40:41 +03:00
Antonis Lempesis 0456f1b788 Merge remote-tracking branch 'origin/beta' into beta 2024-06-14 15:11:30 +03:00
Antonis Lempesis 38636942c7 filtering out deletedbyinference and invinsible results from accessroute 2024-06-14 15:11:19 +03:00
Lampros Smyrnaios d942a1101b Miscellaneous updates to the copying operation to Impala Cluster:
- Show some counts and the elapsed time for various sub-tasks.
- Code polishing.
2024-06-14 12:14:38 +03:00
Giambattista Bloisi 9bf2bda1c6 Fix: next returned a null value at end of stream 2024-06-12 13:28:51 +02:00
Giambattista Bloisi d90cb099b8 Fix for paginationStart parameter management 2024-06-11 20:23:44 +02:00
Claudio Atzori 11fe3a4fe0 [graph resolution] use sparkExecutorMemory to define also the memoryOverhead 2024-06-11 14:21:17 +02:00
Claudio Atzori a8d68c9d29 avoid NPEs 2024-06-11 14:19:24 +02:00
Miriam Baglioni 8fe934810f Merge remote-tracking branch 'origin/beta' into beta 2024-06-11 10:28:51 +02:00
Miriam Baglioni 9da006e98c [SDGFoSActionSet]remove datainfo for the result. It is not needed (qualifier.classid = UPDATE) useless since subject do not go at the level of the instance 2024-06-11 10:28:32 +02:00
Giambattista Bloisi 85c1eae7e0 Fixes for pagination strategy looping at end of download 2024-06-10 19:03:58 +02:00
Claudio Atzori b0eba210c0 [actionset promotion] use sparkExecutorMemory to define also the memoryOverhead 2024-06-10 16:15:24 +02:00
Claudio Atzori 3776327a8c hostedby patching to work with the updated Crossref contents, resolved conflict 2024-06-10 15:24:12 +02:00
Claudio Atzori 0139f23d66 Merge pull request 'organization type from OpenOrgs' (#445) from import_openorg_type into beta
Reviewed-on: D-Net/dnet-hadoop#445
2024-06-07 12:17:31 +02:00
Michele Artini c726572418 changed some parameters in OSF test 2024-06-07 12:03:26 +02:00
Claudio Atzori ec79405cc9 [graph raw] set organization type from openorgs 2024-06-07 11:30:31 +02:00
Miriam Baglioni 1477406ecc [bulkTag] fixed issue that made project disappear in graph_10_enriched 2024-06-06 10:45:41 +02:00
Claudio Atzori 92c3abd5a4 [graph cleaning] use sparkExecutorMemory to define also the memoryOverhead 2024-06-06 10:44:33 +02:00
Claudio Atzori ce2364743a applying changes from PR#442: Fix for missing collectedfrom after dedup 2024-06-06 10:43:43 +02:00
Claudio Atzori f70dc76b61 minor 2024-06-06 10:43:10 +02:00
Claudio Atzori 73bd1938a5 [graph2hive] use sparkExecutorMemory to define also the memoryOverhead 2024-06-05 12:17:35 +02:00
Claudio Atzori da5c1e73a4 Merge pull request 'Irish oaipmh exporter' (#443) from irish-oaipmh-exporter into beta
Reviewed-on: D-Net/dnet-hadoop#443
2024-06-05 10:55:09 +02:00
Claudio Atzori a02f3f0d2b code formatting 2024-05-30 10:21:18 +02:00
Alessia Bardi eadfd8d71d Merge pull request 'Updated XMLIterator for splitting on different nodes' (#436) from dblp_collection_plugin into beta
Reviewed-on: D-Net/dnet-hadoop#436
2024-05-29 16:05:06 +02:00
Alessia Bardi 05ee783c07 Merge branch 'beta' into dblp_collection_plugin 2024-05-29 16:04:39 +02:00
Alessia Bardi fe9fb59c90 Merge pull request 'Rest collector plugin on hadoop supports a new param to pass request headers' (#441) from rest-collector-request-header-map into beta
Reviewed-on: D-Net/dnet-hadoop#441
2024-05-29 15:54:39 +02:00
Claudio Atzori c272c4ad68 code formatting 2024-05-29 15:50:07 +02:00
Alessia Bardi c5f4da16a4 Merge branch 'beta' into rest-collector-request-header-map 2024-05-29 15:46:23 +02:00
Alessia 1b165a14a0 Rest collector plugin on hadoop supports a new param to pass request headers 2024-05-29 15:41:36 +02:00
Michele Artini e996787be2 OSF test 2024-05-29 15:05:17 +02:00
Claudio Atzori 62716141c5 Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster' (#440) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#440
2024-05-29 14:34:51 +02:00
Miriam Baglioni 5d85b70e1f [NOAMI] removed Ireland funder id 501100011103. ticket 9635 2024-05-29 11:55:00 +02:00
Lampros Smyrnaios e3f28338c1 Miscellaneous updates to the copying operation to Impala Cluster:
- Assign the WRITE and EXECUTE permissions to the DBs' HDFS-directories, in order to be able to create tables on top of them, in the Impala Cluster.
- Make sure the "copydb" function returns early, when it encounters a fatal error, while respecting the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config.
2024-05-28 17:51:45 +03:00
Giambattista Bloisi 73316d8c83 Add jaxb and jaxws dependencies when compiling with spark-34 profile as they are required to run with jdk > 8 2024-05-28 14:14:51 +02:00
Miriam Baglioni 75d5ddb999 Update to include a blackList that filters out the results we know are wrongly associated to IE - update workflow definition - the blacklist parameter 2024-05-27 12:01:28 +02:00
Miriam Baglioni 87c9c61b41 Update to include a blackList that filters out the results we know are wrongly associated to IE - refactoring 2024-05-27 12:01:16 +02:00
Miriam Baglioni b55fed09f8 Update to include a blackList that filters out the results we know are wrongly associated to IE 2024-05-27 12:01:01 +02:00
Claudio Atzori 107d958b89 [org dedup] avoid NPEs in SparkPrepareNewOrgs 2024-05-27 11:59:54 +02:00
Claudio Atzori 3a7a6ecc32 [org dedup] avoid NPEs in SparkPrepareOrgRels 2024-05-27 11:59:45 +02:00
Claudio Atzori 1af4224d3d [org dedup] avoid NPEs in SparkPrepareOrgRels 2024-05-27 11:59:33 +02:00
Claudio Atzori 0d5bdb2db0 Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta 2024-05-27 11:59:02 +02:00
Claudio Atzori 66548e6a83 Merge pull request 'changes in copy script' (#438) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#438
2024-05-27 11:54:03 +02:00
Antonis Lempesis 15b54a345a added fos lvl4 2024-05-24 13:21:28 +03:00
Lampros Smyrnaios b48ed6e617 Change configuration in the copy-operation to Impala Cluster:
Set the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" parameter to "false".
2024-05-23 16:58:12 +03:00
Lampros Smyrnaios 68322843e2 Small updates to the copy-operation to Impala Cluster:
- Add a configuration-"switch" to control whether the script exits upon an error or not.
- Allow the script to exit when a table could not be created.
- Show the elapsed time for processing each database.
2024-05-23 15:07:49 +03:00
Lampros Smyrnaios c7b32bbacc Update CopyDataToImpalaCluster:
Update the code of acquiring the entities from Ocean cluster, through hive, in order to optimize the process and account for additional reserved keywords in Impala.

Co-authored-by: Antonis Lempesis <antleb@di.uoa.gr>
2024-05-23 13:00:19 +03:00
Giambattista Bloisi 1b2357e10a Merge pull request 'Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12' (#327) from spark34-integration into beta
Reviewed-on: D-Net/dnet-hadoop#327
2024-05-23 09:20:28 +02:00
Sandro La Bruzzo f1fe363b19 merged again from beta (I hope for the last time) 2024-05-22 11:08:52 +02:00
Sandro La Bruzzo 66c1ffc866 merged again from beta (I hope for the last time) 2024-05-22 11:02:46 +02:00
Claudio Atzori 1ea67eba82 Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta 2024-05-21 13:48:48 +02:00
Claudio Atzori f9fb2fef6e Merge pull request 'Modification of Microsoft Academic Graph Mapping' (#435) from mag_only_doi into beta
Reviewed-on: D-Net/dnet-hadoop#435
2024-05-21 13:48:42 +02:00
Claudio Atzori 834461ba26 [graph provision]fixed wf definition, revised serialization of the usage counts measures 2024-05-21 13:48:06 +02:00
Sandro La Bruzzo e8a61d5dd5 removed plugin, use only FileGZip plugin 2024-05-21 13:45:29 +02:00
Sandro La Bruzzo ca9414b737 Implement multiple node name splitter on GZipCollectorPlugin and all nodes that use XMLIterator. If the splitter name contains is a comma separated values it splits for all the values 2024-05-21 09:11:13 +02:00
Sandro La Bruzzo 032bcc8279 since last beta workflow we decide to introduce in the graph only MAG item with DOI and set them invisible ( this should be the same behaviour of the previous DOIBoost mapping).
This commit apply this type of mapping
2024-05-20 09:24:15 +02:00
Sandro La Bruzzo 103e2652b3 merged beta 2024-05-17 14:43:07 +02:00
Sandro La Bruzzo a87f9ea643 fixed scholexplorer bug 2024-05-17 14:16:43 +02:00
Sandro La Bruzzo 6efab4d88e fixed scholexplorer bug 2024-05-16 16:19:18 +02:00
Claudio Atzori 92f018d196 [graph provision] fixed path pointing to an intermediate data store in the working directory 2024-05-15 15:39:18 +02:00
Claudio Atzori 0611c81a2f [graph provision] using Qualifier.classNames to populate the correponsing fields in the JSON payload 2024-05-15 15:33:10 +02:00
Claudio Atzori 1efe7f7e39 [graph provision] upgrade to dhp-schema:6.1.2, included project.oamandatepublications in the JSON payload mapping, fixed serialisation of the usageCounts measures 2024-05-14 12:39:31 +02:00
Claudio Atzori 53e7bb4336 Merge pull request 'rest-collector-plugin-with-retry' (#432) from rest-collector-plugin-with-retry into beta
Reviewed-on: D-Net/dnet-hadoop#432
2024-05-10 09:02:33 +02:00
Claudio Atzori f7d56e2ef2 Merge branch 'beta' into rest-collector-plugin-with-retry 2024-05-10 09:02:21 +02:00
Claudio Atzori c1237ab39e Merge pull request 'Fixes in Graph Provision' (#434) from beta_provision_relation into beta
Reviewed-on: D-Net/dnet-hadoop#434
2024-05-09 14:15:05 +02:00
Claudio Atzori dc3a5858f7 Merge branch 'beta' into beta_provision_relation 2024-05-09 14:14:43 +02:00
Claudio Atzori 55f39f7850 [graph provision] adds the possibility to validate the XML records before storing them via the validateXML parameter 2024-05-09 14:06:04 +02:00
Claudio Atzori 39a2afe8b5 [graph provision] fixed XML serialization of the usage counts measures, renamed workflow actions to better reflect their role 2024-05-09 13:54:42 +02:00
Claudio Atzori 908ed9da7a Merge pull request 'Various fixes in the stats wf' (#430) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#430
2024-05-08 13:41:02 +02:00
Antonis Lempesis 0cada3cc8f every step is run in the analytics queue. Hardcoded for now, will make a parameter later 2024-05-08 13:42:53 +03:00
Antonis Lempesis 90a4fb3547 fixed typos 2024-05-08 13:17:58 +03:00
Claudio Atzori 18aa323ee9 cleanup unused classes, adjustments in the oozie wf definition 2024-05-08 11:36:46 +02:00
Claudio Atzori b4e3389432 fixed property mapping creating the RelatedEntity transient objects. spark cores & memory adjustments. Code formatting 2024-05-07 16:25:17 +02:00
Giambattista Bloisi 711048ceed PrepareRelationsJob rewritten to use Spark Dataframe API and Windowing functions 2024-05-07 15:44:33 +02:00
Sandro La Bruzzo db358ad0d2 code formatted 2024-05-02 15:25:57 +02:00
Sandro La Bruzzo 26bf8e763a merged from beta 2024-05-02 15:20:23 +02:00
Sandro La Bruzzo a860c57bbc updated .gitignore 2024-05-02 15:16:00 +02:00
Sandro La Bruzzo 0646d0d064 Updated main sparkApplication to avoid to require master variable 2024-05-02 15:15:03 +02:00
Michele Artini f4068de298 code reindent + tests 2024-05-02 09:51:33 +02:00
Michele Artini 2615136efc added a retry mechanism 2024-04-30 11:58:42 +02:00
Sandro La Bruzzo 133ead1e3e updated new version of scholexplorer Generation 2024-04-29 09:00:30 +02:00
Sandro La Bruzzo 052c6aac9d formatted code 2024-04-26 16:03:04 +02:00
Sandro La Bruzzo 9cd3bc0f10 Added a new generation of the dump for scholexplorer tested with last version of spark, and strongly refactored 2024-04-26 16:02:07 +02:00
Sandro La Bruzzo 0d628cd62b merged again from beta 2024-04-23 17:34:55 +02:00
Lampros Smyrnaios 49af2e5740 Miscellaneous updates to the copying operation to Impala Cluster:
- Update the algorithm for creating views that depend on other views; overcome some bash-instabilities.
- Upon any error, fail the whole process, not just the current DB-creation, as those errors usually indicate a bug in the initial DB-creation, that should be fixed immediately.
- Enhance parallel-copy of large files by "hadoop distcp" command.
- Reduce the "invalidate metadata" commands to just the current DB's tables, in order to eliminate the general overhead on Impala.
- Show the number of tables and views in the logs.
- Fix some log-messages.
2024-04-23 17:15:04 +03:00
Antonis Lempesis d2649a1429 increased the jvm ram 2024-04-23 16:03:16 +03:00
Sandro La Bruzzo 073f320c6a Added module containing all the dependencies, useful for spark deploy on k8. 2024-04-22 11:32:31 +02:00
Sandro La Bruzzo b84ad0c06e merged beta 2024-04-19 14:39:59 +02:00
Antonis Lempesis b52a5a753b Merge remote-tracking branch 'upstream/beta' into beta 2024-04-19 15:28:28 +03:00
Sandro La Bruzzo 8dd9cf84e2 code formatted 2024-04-19 12:30:59 +02:00
Sandro La Bruzzo 342cb6189b fixed problem on changed signature on RowEncoder
removed property dhp.schema.artifact
2024-04-19 12:13:26 +02:00
Antonis Lempesis c3fe9662b2 all indicator tables are now stored as parquet 2024-04-19 12:45:36 +03:00
Antonis Lempesis 0c71c58df6 fixed the definition of gold_oa 2024-04-18 12:01:27 +03:00
Antonis Lempesis 43d05dbebb fixed the definition of result_country 2024-04-18 11:53:50 +03:00
Antonis Lempesis e728a0897c fixed the definition of indi_pub_bronze_oa 2024-04-18 11:07:55 +03:00
Antonis Lempesis 308ae580a9 slight optimization in indi_pub_gold_oa definition 2024-04-18 10:57:52 +03:00
Antonis Lempesis 27d22bd8f9 slight optimization in indi_pub_gold_oa definition 2024-04-17 23:59:52 +03:00
Antonis Lempesis 1f5aba12fa slight optimization in indi_pub_gold_oa definition 2024-04-17 23:54:23 +03:00
Giambattista Bloisi 613ec5ffce Add profiles for different spark versions: spark-24, spark-34, spark-35 2023-12-05 19:11:06 +01:00
Sandro La Bruzzo 52495f2cd2 used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader 2023-12-05 19:11:06 +01:00
Sandro La Bruzzo 8c3e9a09d3 added repository openaire-third-parties 2023-12-05 19:11:06 +01:00
Giambattista Bloisi 2fa78f6071 Changes requires to build and run tests with Java 17 2023-12-05 19:11:06 +01:00
Giambattista Bloisi 326c9dc08c Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12 2023-12-05 19:11:06 +01:00
138 changed files with 4931 additions and 2303 deletions

.gitignore vendored
View File

@@ -27,3 +27,4 @@ spark-warehouse
 /**/.factorypath
 /**/.scalafmt.conf
 /.java-version
+/dhp-shade-package/dependency-reduced-pom.xml

View File

@@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
     mojo.outputFile = testFolder;
     // execute
-    Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+    try {
+      mojo.execute();
+      Assertions.assertTrue(false); // not reached
+    } catch (Exception e) {
+      Assertions
+        .assertTrue(
+          MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
+            IllegalArgumentException.class.isAssignableFrom(e.getClass()));
+    }
   }

   @Test

View File

@@ -70,10 +70,7 @@
     <groupId>com.ibm.icu</groupId>
     <artifactId>icu4j</artifactId>
   </dependency>
-  <dependency>
-    <groupId>org.apache.hadoop</groupId>
-    <artifactId>hadoop-common</artifactId>
-  </dependency>
   <dependency>
     <groupId>com.github.sisyphsu</groupId>
     <artifactId>dateparser</artifactId>
@@ -163,7 +160,7 @@
   <dependency>
     <groupId>eu.dnetlib.dhp</groupId>
-    <artifactId>${dhp-schemas.artifact}</artifactId>
+    <artifactId>dhp-schemas</artifactId>
   </dependency>
   <dependency>
@@ -172,4 +169,23 @@
     </dependency>
   </dependencies>
+  <!-- dependencies required on JDK9+ because J2EE has been removed -->
+  <profiles>
+    <profile>
+      <id>spark-34</id>
+      <dependencies>
+        <dependency>
+          <groupId>javax.xml.bind</groupId>
+          <artifactId>jaxb-api</artifactId>
+          <version>2.2.11</version>
+        </dependency>
+        <dependency>
+          <groupId>com.sun.xml.ws</groupId>
+          <artifactId>jaxws-ri</artifactId>
+          <version>2.3.3</version>
+          <type>pom</type>
+        </dependency>
+      </dependencies>
+    </profile>
+  </profiles>
 </project>

View File

@@ -38,7 +38,7 @@ public class PacePerson {
           PacePerson.class
             .getResourceAsStream(
               "/eu/dnetlib/dhp/common/name_particles.txt")));
-    } catch (IOException e) {
+    } catch (Exception e) {
       throw new RuntimeException(e);
     }
   }

View File

@@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
    * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
    * concept_rec_id = 656930
    * @return response code
-   * @throws IOException
-   * @throws MissingConceptDoiException
    */
   public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
     setDepositionId(concept_rec_id, 1);

View File

@@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.lang3.time.DateUtils;
 import org.apache.http.HttpHeaders;
-import org.joda.time.Instant;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

View File

@@ -0,0 +1,106 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.*;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
public class MergeEntitiesComparator implements Comparator<Oaf> {
static final List<String> PID_AUTHORITIES = Arrays
.asList(
ModelConstants.ARXIV_ID,
ModelConstants.PUBMED_CENTRAL_ID,
ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
ModelConstants.DATACITE_ID,
ModelConstants.CROSSREF_ID);
static final List<String> RESULT_TYPES = Arrays
.asList(
ModelConstants.ORP_RESULTTYPE_CLASSID,
ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
ModelConstants.DATASET_RESULTTYPE_CLASSID,
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
@Override
public int compare(Oaf left, Oaf right) {
if (left == null && right == null)
return 0;
if (left == null)
return -1;
if (right == null)
return 1;
int res = 0;
// pid authority
int cfp1 = Optional
.ofNullable(left.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
.max(Integer::compare)
.orElse(-1))
.orElse(-1);
int cfp2 = Optional
.ofNullable(right.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
.max(Integer::compare)
.orElse(-1))
.orElse(-1);
if (cfp1 >= 0 && cfp1 > cfp2) {
return 1;
} else if (cfp2 >= 0 && cfp2 > cfp1) {
return -1;
}
// trust
if (left.getDataInfo() != null && right.getDataInfo() != null) {
res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
}
// result type
if (res == 0) {
if (left instanceof Result && right instanceof Result) {
Result r1 = (Result) left;
Result r2 = (Result) right;
if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
return -1;
}
} else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
return 1;
}
int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());
if (rt1 >= 0 && rt1 > rt2) {
return 1;
} else if (rt2 >= 0 && rt2 > rt1) {
return -1;
}
}
}
// id
if (res == 0) {
if (left instanceof OafEntity && right instanceof OafEntity) {
res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
}
}
return res;
}
}

View File

@@ -40,27 +40,12 @@ public class MergeUtils {
   public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
     boolean checkDelegateAuthority) {
-    TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
-      int res = 0;
-      if (o1.getDataInfo() != null && o2.getDataInfo() != null) {
-        res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust());
-      }
-      if (res == 0) {
-        if (o1 instanceof Result && o2 instanceof Result) {
-          return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2);
-        }
-      }
-      return res;
-    });
-    while (oafEntityIterator.hasNext()) {
-      sortedEntities.add(oafEntityIterator.next());
-    }
-    Iterator<T> it = sortedEntities.descendingIterator();
+    ArrayList<T> sortedEntities = new ArrayList<>();
+    oafEntityIterator.forEachRemaining(sortedEntities::add);
+    sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());
+    Iterator<T> it = sortedEntities.iterator();
     T merged = it.next();
     while (it.hasNext()) {
@@ -143,7 +128,7 @@ public class MergeUtils {
    * https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
    * such version.
    * <p>
-   * Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
+   * Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator}
    * and proceeds with the canonical property merging.
    *
    * @param left
@@ -161,8 +146,9 @@ public class MergeUtils {
     if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
       return right;
     }
     // TODO: raise trust to have preferred fields from one or the other??
-    if (new ResultTypeComparator().compare(left, right) < 0) {
+    if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) {
       return mergeResultFields(left, right);
     } else {
       return mergeResultFields(right, left);
@@ -225,9 +211,9 @@ public class MergeUtils {
   private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
     Function<T, K> keyExtractor, BinaryOperator<T> merger) {
-    if (left == null) {
-      return right;
-    } else if (right == null) {
+    if (left == null || left.isEmpty()) {
+      return right != null ? right : new ArrayList<>();
+    } else if (right == null || right.isEmpty()) {
       return left;
     }
@@ -405,7 +391,7 @@ public class MergeUtils {
     }
     // should be an instance attribute, get the first non-null value
-    merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage()));
+    merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage()));
     // distinct countries, do not manage datainfo
     merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
@@ -575,6 +561,13 @@ public class MergeUtils {
     return m != null ? m : e;
   }
+  private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
+    if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) {
+      return e;
+    }
+    return m;
+  }
   private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
     List<List<Author>> authors = new ArrayList<>();
     if (author != null) {
@@ -587,6 +580,10 @@
   }
   private static String instanceKeyExtractor(Instance i) {
+    // three levels of concatenating:
+    // 1. ::
+    // 2. @@
+    // 3. ||
     return String
       .join(
         "::",
@@ -594,10 +591,10 @@
         kvKeyExtractor(i.getCollectedfrom()),
         qualifierKeyExtractor(i.getAccessright()),
         qualifierKeyExtractor(i.getInstancetype()),
-        Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
+        Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
         Optional
           .ofNullable(i.getPid())
-          .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
+          .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
           .orElse(null));
   }
@@ -706,7 +703,7 @@ public class MergeUtils {
   private static String spKeyExtractor(StructuredProperty sp) {
     return Optional
       .ofNullable(sp)
-      .map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
+      .map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
       .orElse(null);
   }

View File

@@ -1,87 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Optional;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
public class ResultTypeComparator implements Comparator<Result> {
public static final ResultTypeComparator INSTANCE = new ResultTypeComparator();
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
HashSet<String> lCf = getCollectedFromIds(left);
HashSet<String> rCf = getCollectedFromIds(right);
if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
return -1;
}
if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
return 1;
}
if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
return 0;
}
return 1;
} else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
return -1;
}
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (!lClass.equals(rClass)) {
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
}
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
protected HashSet<String> getCollectedFromIds(Result left) {
return Optional
.ofNullable(left.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(KeyValue::getKey)
.collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<>());
}
}

View File

@@ -154,5 +154,13 @@
   "unknown":{
     "original":"Unknown",
     "inverse":"Unknown"
+  },
+  "isamongtopnsimilardocuments": {
+    "original": "IsAmongTopNSimilarDocuments",
+    "inverse": "HasAmongTopNSimilarDocuments"
+  },
+  "hasamongtopnsimilardocuments": {
+    "original": "HasAmongTopNSimilarDocuments",
+    "inverse": "IsAmongTopNSimilarDocuments"
   }
 }

View File

@@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
     val conf: SparkConf = new SparkConf()
     val master = parser.get("master")
     log.info(s"Creating Spark session: Master: $master")
-    SparkSession
+    val b = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
-     .master(master)
-     .getOrCreate()
+    if (master != null)
+      b.master(master)
+    b.getOrCreate()
   }

   def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {

View File

@@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
   }

   def generateScholixResourceFromResult(r: Result): ScholixResource = {
+    val sum = ScholixUtils.resultToSummary(r)
+    if (sum != null)
       generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+    else
+      null
   }

   val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@@ -153,6 +157,14 @@
   }

+  def invRel(rel: String): String = {
+    val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
+    if (semanticRelation != null)
+      semanticRelation.inverse
+    else
+      null
+  }
+
   def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
     if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
       val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@@ -377,10 +389,7 @@
     if (persistentIdentifiers.isEmpty)
       return null
     s.setLocalIdentifier(persistentIdentifiers.asJava)
-    if (r.isInstanceOf[Publication])
-      s.setTypology(Typology.publication)
-    else
-      s.setTypology(Typology.dataset)
+    // s.setTypology(r.getResulttype.getClassid)
     s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

View File

@@ -24,7 +24,7 @@
         <executions>
           <execution>
             <id>scala-compile-first</id>
-            <phase>initialize</phase>
+            <phase>process-resources</phase>
             <goals>
               <goal>add-source</goal>
               <goal>compile</goal>
@@ -59,14 +59,6 @@
     <groupId>edu.cmu</groupId>
     <artifactId>secondstring</artifactId>
   </dependency>
-  <dependency>
-    <groupId>com.google.guava</groupId>
-    <artifactId>guava</artifactId>
-  </dependency>
-  <dependency>
-    <groupId>com.google.code.gson</groupId>
-    <artifactId>gson</artifactId>
-  </dependency>
   <dependency>
     <groupId>org.apache.commons</groupId>
     <artifactId>commons-lang3</artifactId>
@@ -91,10 +83,6 @@
     <groupId>com.fasterxml.jackson.core</groupId>
     <artifactId>jackson-databind</artifactId>
   </dependency>
-  <dependency>
-    <groupId>org.apache.commons</groupId>
-    <artifactId>commons-math3</artifactId>
-  </dependency>
   <dependency>
     <groupId>com.jayway.jsonpath</groupId>
     <artifactId>json-path</artifactId>
@@ -113,4 +101,90 @@
     </dependency>
   </dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-35</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
 </project>

View File

@@ -1,12 +1,6 @@
 package eu.dnetlib.pace.common;

-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
@@ -15,6 +9,13 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;

+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
 /**
  * Set of common functions for the framework
  *

View File

@@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
 import com.jayway.jsonpath.{Configuration, JsonPath}
 import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
   val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

   val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(RowEncoder(schema))
+    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
   }

   def rowFromJson(json: String): Row = {

View File

@@ -0,0 +1,12 @@
package eu.dnetlib.pace.util

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType

object SparkCompatUtils {

  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
    RowEncoder(schema)
  }
}

View File

@@ -0,0 +1,12 @@
package eu.dnetlib.pace.util

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.StructType

object SparkCompatUtils {

  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
    ExpressionEncoder(schema)
  }
}

View File

@@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import eu.dnetlib.pace.model.Person;
+import jdk.nashorn.internal.ir.annotations.Ignore;

 public class UtilTest {

View File

@@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>dhp</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-shade-package</artifactId>
<description>This module create a jar of all module dependencies</description>
<build>
<plugins>
<plugin>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer>
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<transformer />
<transformer>
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
<pattern>com</pattern>
<shadedPattern>repackaged.com.google.common</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.6.1</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>junit-jupiter-api</artifactId>
<groupId>org.junit.jupiter</groupId>
</exclusion>
<exclusion>
<artifactId>junit-jupiter-params</artifactId>
<groupId>org.junit.jupiter</groupId>
</exclusion>
<exclusion>
<artifactId>junit-jupiter-engine</artifactId>
<groupId>org.junit.jupiter</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>3.3.3</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>byte-buddy</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
<exclusion>
<artifactId>byte-buddy-agent</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
</dependencies>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
</project>

dhp-shade-package/pom.xml
View File

@@ -0,0 +1,169 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-shade-package</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module create a jar of all module dependencies</description>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-aggregation</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-blacklist</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-broker-events</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-enrichment</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-impact-indicators</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-actionsets</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-hist-snaps</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-monitor-irish</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-promote</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-swh</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-raw-data-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-stats-build</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
<pattern>com</pattern>
<shadedPattern>repackaged.com.google.common</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -103,6 +103,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -156,6 +157,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@@ -95,6 +95,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@@ -125,6 +125,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@@ -95,6 +95,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@@ -103,6 +103,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -155,11 +156,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=2560
+--conf spark.sql.shuffle.partitions=8000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>

View File

@@ -95,6 +95,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@@ -103,11 +103,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=7000
+--conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@@ -156,11 +157,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=7000
+--conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

View File

@@ -95,11 +95,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=10000
+--conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>

View File

@@ -103,6 +103,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -155,11 +156,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=2560
+--conf spark.sql.shuffle.partitions=4000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>

View File

@ -9,6 +9,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
.union(openAPCRelations) .union(openAPCRelations)
.union(dataciteRelations) .union(dataciteRelations)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}); });
} }

View File

@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD resultsRDD
.union(projectsRDD) .union(projectsRDD)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}); });
} }
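The two hunks above swap the action set output compression from GzipCodec to BZip2Codec (BZip2 is splittable, Gzip is not, so downstream jobs can read the sequence files in parallel). A minimal, self-contained sketch of the same saveAsHadoopFile call pattern, using a hypothetical local path and made-up key/value data:

import java.util.Arrays;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class BZip2OutputSketch {
	public static void main(String[] args) {
		try (JavaSparkContext sc = new JavaSparkContext(
			new SparkConf().setAppName("bzip2-output-sketch").setMaster("local[*]"))) {
			sc
				.parallelize(Arrays.asList("clazz\tpayload"))
				// build the Writable pair inside the task, mirroring the Text/Text pairs written above
				.mapToPair(s -> new Tuple2<>(new Text(s.split("\t")[0]), new Text(s.split("\t")[1])))
				.saveAsHadoopFile(
					"/tmp/actionset-sketch", Text.class, Text.class,
					SequenceFileOutputFormat.class, BZip2Codec.class);
		}
	}
}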

View File

@ -115,19 +115,7 @@ public class PrepareFOSSparkJob implements Serializable {
.forEach( .forEach(
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true))); l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
r.setSubject(sbjs); r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r; return r;
} }

View File

@ -81,19 +81,7 @@ public class PrepareSDGSparkJob implements Serializable {
s -> sbjs s -> sbjs
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID))); .add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs); r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r; return r;
}, Encoders.bean(Result.class)) }, Encoders.bean(Result.class))
.write() .write()

View File

@ -0,0 +1,80 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class CoAuthorshipIterator implements Iterator<Relation> {
private int firstIndex;
private int secondIndex;
private boolean firstRelation;
private List<String> authors;
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______::";
private static final String OPENAIRE_PREFIX = "openaire____";
private static final String SEPARATOR = "::";
private static final String ORCID_KEY = "10|" + OPENAIRE_PREFIX + SEPARATOR
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
@Override
public boolean hasNext() {
return firstIndex < authors.size() - 1;
}
@Override
public Relation next() {
Relation rel = null;
if (firstRelation) {
rel = getRelation(authors.get(firstIndex), authors.get(secondIndex));
firstRelation = Boolean.FALSE;
} else {
rel = getRelation(authors.get(secondIndex), authors.get(firstIndex));
firstRelation = Boolean.TRUE;
secondIndex += 1;
if (secondIndex >= authors.size()) {
firstIndex += 1;
secondIndex = firstIndex + 1;
}
}
return rel;
}
public CoAuthorshipIterator(List<String> authors) {
this.authors = authors;
this.firstIndex = 0;
this.secondIndex = 1;
this.firstRelation = Boolean.TRUE;
}
private Relation getRelation(String orcid1, String orcid2) {
String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
return OafMapperUtils
.getRelation(
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
ModelConstants.PERSON_PERSON_SUBRELTYPE,
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(ORCID_KEY, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null);
}
}
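A hypothetical usage of the iterator above (made-up ORCID iDs, assuming CoAuthorshipIterator and eu.dnetlib.dhp.schema.oaf.Relation are on the classpath): for n authors it emits n*(n-1) hasCoAuthored relations, i.e. both directions for every unordered pair.

import java.util.Arrays;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.Relation;

public class CoAuthorshipIteratorSketch {
	public static void main(String[] args) {
		List<String> orcids = Arrays
			.asList("0000-0001-0000-0001", "0000-0002-0000-0002", "0000-0003-0000-0003");
		CoAuthorshipIterator it = new CoAuthorshipIterator(orcids);
		while (it.hasNext()) {
			Relation r = it.next();
			// prints 6 lines: both directions for the pairs (1st,2nd), (1st,3rd) and (2nd,3rd)
			System.out.println(r.getSource() + " -> " + r.getTarget());
		}
	}
}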

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class Coauthors implements Serializable {
private List<String> coauthors;
public List<String> getCoauthors() {
return coauthors;
}
public void setCoauthors(List<String> coauthors) {
this.coauthors = coauthors;
}
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class Couples implements Serializable {
Person p;
Relation r;
public Couples() {
}
public Person getP() {
return p;
}
public void setP(Person p) {
this.p = p;
}
public Relation getR() {
return r;
}
public void setR(Relation r) {
this.r = r;
}
public static <Tuples> Couples newInstance(Tuple2<Person, Relation> couple) {
Couples c = new Couples();
c.p = couple._1();
c.r = couple._2();
return c;
}
}

View File

@ -0,0 +1,431 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.*;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spark_project.jetty.util.StringUtil;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.actionmanager.transformativeagreement.model.TransformativeAgreementModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.orcid.model.Author;
import eu.dnetlib.dhp.collection.orcid.model.Employment;
import eu.dnetlib.dhp.collection.orcid.model.Work;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Pid;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class ExtractPerson implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String OPENAIRE_PREFIX = "openaire____";
private static final String SEPARATOR = "::";
private static final String orcidKey = "10|" + OPENAIRE_PREFIX + SEPARATOR
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
private static final String DOI_PREFIX = "50|doi_________::";
private static final String PMID_PREFIX = "50|pmid________::";
private static final String ARXIV_PREFIX = "50|arXiv_______::";
private static final String PMCID_PREFIX = "50|pmcid_______::";
private static final String ROR_PREFIX = "20|ror_________::";
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
ExtractPerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json"))));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir {}", workingDir);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
createActionSet(spark, inputPath, outputPath, workingDir);
});
}
private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {
Dataset<Author> authors = spark
.read()
.parquet(inputPath + "Authors")
.as(Encoders.bean(Author.class));
Dataset<Work> works = spark
.read()
.parquet(inputPath + "Works")
.as(Encoders.bean(Work.class))
.filter(
(FilterFunction<Work>) w -> Optional.ofNullable(w.getPids()).isPresent() &&
w
.getPids()
.stream()
.anyMatch(
p -> p.getSchema().equalsIgnoreCase("doi") ||
p.getSchema().equalsIgnoreCase("pmc") ||
p.getSchema().equalsIgnoreCase("pmid") ||
p.getSchema().equalsIgnoreCase("arxiv")));
Dataset<Employment> employmentDataset = spark
.read()
.parquet(inputPath + "Employments")
.as(Encoders.bean(Employment.class));
Dataset<Author> peopleToMap = authors
.joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
.map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
.groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));
Dataset<Employment> employment = employmentDataset
.joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
.map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));
Dataset<Person> people;
peopleToMap.map((MapFunction<Author, Person>) op -> {
Person person = new Person();
person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
person
.setBiography(
Optional
.ofNullable(op.getBiography())
.orElse(""));
KeyValue kv = OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS);
kv.setDataInfo(null);
person.setCollectedfrom(Arrays.asList(kv));
person
.setAlternativeNames(
Optional
.ofNullable(op.getOtherNames())
.orElse(new ArrayList<>()));
person
.setFamilyName(
Optional
.ofNullable(op.getFamilyName())
.orElse(""));
person
.setGivenName(
Optional
.ofNullable(op.getGivenName())
.orElse(""));
person
.setPid(
Optional
.ofNullable(op.getOtherPids())
.map(
v -> v
.stream()
.map(p -> Pid.newInstance(p.getSchema(), p.getValue()))
.collect(Collectors.toList()))
.orElse(new ArrayList<>()));
person.getPid().add(Pid.newInstance(ModelConstants.ORCID, op.getOrcid()));
person.setDateofcollection(op.getLastModifiedDate());
person.setOriginalId(Arrays.asList(op.getOrcid()));
return person;
}, Encoders.bean(Person.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/people");
works
.flatMap(
(FlatMapFunction<Work, Relation>) ExtractPerson::getAuthorshipRelationIterator,
Encoders.bean(Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/authorship");
Dataset<Relation> coauthorship = works
.flatMap((FlatMapFunction<Work, Tuple2<String, String>>) w -> {
List<Tuple2<String, String>> lista = new ArrayList<>();
w.getPids().stream().forEach(p -> {
if (p.getSchema().equalsIgnoreCase("doi") || p.getSchema().equalsIgnoreCase("pmc")
|| p.getSchema().equalsIgnoreCase("pmid") || p.getSchema().equalsIgnoreCase("arxiv"))
lista.add(new Tuple2<>(p.getValue(), w.getOrcid()));
});
return lista.iterator();
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.groupByKey((MapFunction<Tuple2<String, String>, String>) Tuple2::_1, Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Tuple2<String, String>, Coauthors>) (k, it) -> extractCoAuthors(it),
Encoders.bean(Coauthors.class))
.flatMap(
(FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
Encoders.bean(Relation.class))
.groupByKey((MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class));
coauthorship
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/coauthorship");
employment
.filter((FilterFunction<Employment>) e -> Optional.ofNullable(e.getAffiliationId()).isPresent())
.filter((FilterFunction<Employment>) e -> e.getAffiliationId().getSchema().equalsIgnoreCase("ror"))
.map(
(MapFunction<Employment, Relation>) ExtractPerson::getAffiliationRelation,
Encoders.bean(Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/affiliation");
people = spark
.read()
.textFile(workingDir + "/people")
.map(
(MapFunction<String, Person>) value -> OBJECT_MAPPER
.readValue(value, Person.class),
Encoders.bean(Person.class));
people.show(false);
people
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.union(
getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
.union(
getRelations(spark, workingDir + "/coauthorship")
.toJavaRDD()
.map(r -> new AtomicAction(r.getClass(), r)))
.union(
getRelations(spark, workingDir + "/affiliation")
.toJavaRDD()
.map(r -> new AtomicAction(r.getClass(), r)))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}
private static Dataset<Relation> getRelations(SparkSession spark, String path) {
return spark
.read()
.textFile(path)
.map(
(MapFunction<String, Relation>) value -> OBJECT_MAPPER
.readValue(value, Relation.class),
Encoders.bean(Relation.class));// spark.read().json(path).as(Encoders.bean(Relation.class));
}
private static Coauthors extractCoAuthors(Iterator<Tuple2<String, String>> it) {
Coauthors coauth = new Coauthors();
List<String> coauthors = new ArrayList<>();
while (it.hasNext())
coauthors.add(it.next()._2());
coauth.setCoauthors(coauthors);
return coauth;
}
private static Relation getAffiliationRelation(Employment row) {
String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
String target = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
List<KeyValue> properties = new ArrayList<>();
Relation relation = OafMapperUtils
.getRelation(
source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
ModelConstants.ORG_PERSON_PARTICIPATES,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null);
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
KeyValue kv = new KeyValue();
kv.setKey("startDate");
kv.setValue(row.getStartDate());
properties.add(kv);
}
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
KeyValue kv = new KeyValue();
kv.setKey("endDate");
kv.setValue(row.getEndDate());
properties.add(kv);
}
if (properties.size() > 0)
relation.setProperties(properties);
return relation;
}
private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
return Arrays
.asList(
OafMapperUtils
.getRelation(
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
ModelConstants.PERSON_PERSON_SUBRELTYPE,
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null),
OafMapperUtils
.getRelation(
target, source, ModelConstants.PERSON_PERSON_RELTYPE,
ModelConstants.PERSON_PERSON_SUBRELTYPE,
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null));
}
private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {
if (Optional.ofNullable(w.getPids()).isPresent())
return w
.getPids()
.stream()
.map(pid -> getRelation(w.getOrcid(), pid))
.filter(Objects::nonNull)
.collect(Collectors.toList())
.iterator();
List<Relation> ret = new ArrayList<>();
return ret.iterator();
}
private static Relation getRelation(String orcid, eu.dnetlib.dhp.collection.orcid.model.Pid pid) {
String target;
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
switch (pid.getSchema()) {
case "doi":
target = DOI_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), pid.getValue()));
break;
case "pmid":
target = PMID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), pid.getValue()));
break;
case "arxiv":
target = ARXIV_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.arXiv.toString(), pid.getValue()));
break;
case "pmcid":
target = PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), pid.getValue()));
break;
default:
return null;
}
return OafMapperUtils
.getRelation(
source, target, ModelConstants.RESULT_PERSON_RELTYPE,
ModelConstants.RESULT_PERSON_SUBRELTYPE,
ModelConstants.RESULT_PERSON_HASAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null);
}
}
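The coauthorship step above flattens each Work into (pid, orcid) tuples, groups them by pid, and feeds each group's ORCID list to CoAuthorshipIterator before deduplicating relations by source+target. A plain-Java illustration of that grouping idea, with made-up pids and ORCID iDs (it assumes the classes above are on the classpath):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CoauthorGroupingSketch {
	public static void main(String[] args) {
		// hypothetical (pid, orcid) pairs standing in for the flatMap over works
		List<String[]> pairs = Arrays.asList(
			new String[] { "doi:10.1234/a", "0000-0001" },
			new String[] { "doi:10.1234/a", "0000-0002" },
			new String[] { "doi:10.5678/b", "0000-0003" });
		Map<String, List<String>> coauthorsByPid = new HashMap<>();
		for (String[] p : pairs) {
			coauthorsByPid.computeIfAbsent(p[0], k -> new ArrayList<>()).add(p[1]);
		}
		// each group's ORCID list would back a Coauthors bean and drive the CoAuthorshipIterator;
		// single-author groups yield no relations
		coauthorsByPid.values().forEach(orcids -> new CoAuthorshipIterator(orcids)
			.forEachRemaining(r -> System.out.println(r.getSource() + " -> " + r.getTarget())));
	}
}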

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import java.util.ArrayList;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import eu.dnetlib.dhp.collection.orcid.model.Work;
public class WorkList implements Serializable {
private ArrayList<Work> workArrayList;
public ArrayList<Work> getWorkArrayList() {
return workArrayList;
}
public void setWorkArrayList(ArrayList<Work> workArrayList) {
this.workArrayList = workArrayList;
}
public WorkList() {
workArrayList = new ArrayList<>();
}
}

View File

@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
@ -70,6 +71,9 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final String blackListInputPath = parser.get("blackListPath");
log.info("blackListInputPath: {}", blackListInputPath);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
@ -77,25 +81,31 @@ public class CreateActionSetFromWebEntries implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
createActionSet(spark, inputPath, outputPath); createActionSet(spark, inputPath, outputPath, blackListInputPath);
}); });
} }
public static void createActionSet(SparkSession spark, String inputPath, public static void createActionSet(SparkSession spark, String inputPath,
String outputPath) { String outputPath, String blackListInputPath) {
final Dataset<Row> dataset = readWebCrawl(spark, inputPath) final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
.filter("publication_year <= 2020 or country_code=='IE'") .filter("country_code=='IE'")
.drop("publication_year"); .drop("publication_year");
dataset.flatMap((FlatMapFunction<Row, Relation>) row -> { final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
dataset
.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
.drop("OpenAlexId")
.flatMap((FlatMapFunction<Row, Relation>) row -> {
List<Relation> ret = new ArrayList<>(); List<Relation> ret = new ArrayList<>();
final String ror = ROR_PREFIX final String ror = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror"))); + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror)); ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror)); // ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror)); // ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
return ret return ret
.iterator(); .iterator();
@ -129,11 +139,26 @@ public class CreateActionSetFromWebEntries implements Serializable {
"institution", functions "institution", functions
.explode( .explode(
functions.col("institutions"))) functions.col("institutions")))
.selectExpr( .selectExpr(
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror", "id", "doi", "institution.ror as ror",
"institution.country_code as country_code", "publication_year") "institution.country_code as country_code", "publication_year")
.distinct(); .distinct();
// .selectExpr(
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
// "institution.country_code as country_code", "publication_year")
// .distinct();
}
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
return spark
.read()
.option("header", true)
.csv(inputPath)
.select("OpenAlexId");
} }
private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) { private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {

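The join added above implements a blacklist filter: a left join on OpenAlexId, a null check, and a drop of the join column, which amounts to a left anti-join. An equivalent, more compact formulation using Spark's left_anti join type (same hypothetical column names as above):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class BlackListFilterSketch {
	// dataset carries an "id" column, blackList carries "OpenAlexId"
	static Dataset<Row> dropBlacklisted(Dataset<Row> dataset, Dataset<Row> blackList) {
		// keep only rows of dataset whose id does not appear in the blacklist
		return dataset
			.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left_anti");
	}
}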
View File

@ -20,6 +20,9 @@ public class Author extends ORCIDItem {
private String lastModifiedDate; private String lastModifiedDate;
public Author() {
}
public String getBiography() { public String getBiography() {
return biography; return biography;
} }

View File

@ -11,4 +11,7 @@ public class ORCIDItem {
public void setOrcid(String orcid) { public void setOrcid(String orcid) {
this.orcid = orcid; this.orcid = orcid;
} }
public ORCIDItem() {
}
} }

View File

@ -32,4 +32,6 @@ public class Work extends ORCIDItem {
pids.add(pid); pids.add(pid);
} }
public Work() {
}
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.collection.plugin.rest; package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.Spliterator; import java.util.Spliterator;
import java.util.Spliterators; import java.util.Spliterators;
@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.google.gson.Gson;
import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
final String entityXpath = api.getParams().get("entityXpath"); final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod"); final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken"); final String authToken = api.getParams().get("authToken");
final String requestHeaderMap = api.getParams().get("requestHeaderMap");
Gson gson = new Gson();
Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
final String resultSizeValue = Optional final String resultSizeValue = Optional
.ofNullable(api.getParams().get("resultSizeValue")) .ofNullable(api.getParams().get("resultSizeValue"))
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)
@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
if (StringUtils.isBlank(resultFormatValue)) { if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty"); throw new CollectorException("Param 'resultFormatValue' is null or empty");
} }
if (StringUtils.isBlank(queryParams)) {
throw new CollectorException("Param 'queryParams' is null or empty");
}
if (StringUtils.isBlank(entityXpath)) { if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty"); throw new CollectorException("Param 'entityXpath' is null or empty");
} }
@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
entityXpath, entityXpath,
authMethod, authMethod,
authToken, authToken,
resultOutputFormat); resultOutputFormat,
requestHeaders);
return StreamSupport return StreamSupport
.stream( .stream(

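The new requestHeaderMap parameter is read from the API params and parsed with Gson into a map of HTTP header name/value pairs handed to RestIterator. A small sketch of that parsing with a made-up header map (the header names and values are illustrative only):

import java.util.Map;

import com.google.gson.Gson;

public class RequestHeaderMapSketch {
	public static void main(String[] args) {
		// hypothetical value of the "requestHeaderMap" API parameter
		String requestHeaderMap = "{\"Accept\": \"application/json\", \"User-Agent\": \"dnet-collector\"}";
		Map requestHeaders = new Gson().fromJson(requestHeaderMap, Map.class);
		requestHeaders.forEach((k, v) -> System.out.println(k + ": " + v));
	}
}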
View File

@ -9,8 +9,11 @@ import java.net.URL;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import java.util.Queue; import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.PriorityBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys; import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer; import javax.xml.transform.Transformer;
@ -22,14 +25,14 @@ import javax.xml.xpath.*;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.w3c.dom.Node; import org.w3c.dom.Node;
import org.w3c.dom.NodeList; import org.w3c.dom.NodeList;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;
@ -44,23 +47,28 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
* *
*/ */
public class RestIterator implements Iterator<String> { public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class); private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8"; public static final String UTF_8 = "UTF-8";
private static final int MAX_ATTEMPTS = 5;
private final HttpClientParams clientParams; private final HttpClientParams clientParams;
private final String BASIC = "basic"; private final String AUTHBASIC = "basic";
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
+ ">";
private final String baseUrl; private final String baseUrl;
private final String resumptionType; private final String resumptionType;
private final String resumptionParam; private final String resumptionParam;
private final String resultFormatValue; private final String resultFormatValue;
private String queryParams; private String queryParams = "";
private final int resultSizeValue; private final int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest) private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1; private int resultTotal = -1;
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
// harvest
// or token scanned from results) // or token scanned from results)
private InputStream resultStream; private InputStream resultStream;
private Transformer transformer; private Transformer transformer;
@ -73,7 +81,7 @@ public class RestIterator implements Iterator<String> {
private final String querySize; private final String querySize;
private final String authMethod; private final String authMethod;
private final String authToken; private final String authToken;
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>(); private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
private int discoverResultSize = 0; private int discoverResultSize = 0;
private int pagination = 1; private int pagination = 1;
/* /*
@ -83,8 +91,13 @@ public class RestIterator implements Iterator<String> {
*/ */
private final String resultOutputFormat; private final String resultOutputFormat;
/** RestIterator class /*
* compatible to version 1.3.33 * Can be used to set additional request headers, like for content negotiation
*/
private Map<String, String> requestHeaders;
/**
* RestIterator class compatible to version 1.3.33
*/ */
public RestIterator( public RestIterator(
final HttpClientParams clientParams, final HttpClientParams clientParams,
@ -101,47 +114,56 @@ public class RestIterator implements Iterator<String> {
final String entityXpath, final String entityXpath,
final String authMethod, final String authMethod,
final String authToken, final String authToken,
final String resultOutputFormat) { final String resultOutputFormat,
final Map<String, String> requestHeaders) {
this.clientParams = clientParams; this.clientParams = clientParams;
this.baseUrl = baseUrl; this.baseUrl = baseUrl;
this.resumptionType = resumptionType; this.resumptionType = resumptionType;
this.resumptionParam = resumptionParam; this.resumptionParam = resumptionParam;
this.resultFormatValue = resultFormatValue; this.resultFormatValue = resultFormatValue;
this.resultSizeValue = Integer.valueOf(resultSizeValueStr); this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
this.queryParams = queryParams; this.queryParams = queryParams;
this.authMethod = authMethod; this.authMethod = authMethod;
this.authToken = authToken; this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat; this.resultOutputFormat = resultOutputFormat;
this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
: ""; : "";
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
try { try {
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
} catch (Exception e) { } catch (final Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
} }
initQueue(); initQueue();
} }
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
final String entityXpath)
throws TransformerConfigurationException, XPathExpressionException { throws TransformerConfigurationException, XPathExpressionException {
final TransformerFactory factory = TransformerFactory.newInstance(); final TransformerFactory factory = TransformerFactory.newInstance();
transformer = factory.newTransformer(); this.transformer = factory.newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes"); this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
xpath = XPathFactory.newInstance().newXPath(); this.xpath = XPathFactory.newInstance().newXPath();
xprResultTotalPath = xpath.compile(resultTotalXpath); this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
xprEntity = xpath.compile(entityXpath); this.xprEntity = this.xpath.compile(entityXpath);
} }
private void initQueue() { private void initQueue() {
if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
query = baseUrl;
} else {
query = baseUrl + "?" + queryParams + querySize + queryFormat; query = baseUrl + "?" + queryParams + querySize + queryFormat;
log.info("REST calls starting with {}", query); }
log.info("REST calls starting with {}", this.query);
} }
private void disconnect() { private void disconnect() {
@ -154,11 +176,22 @@ public class RestIterator implements Iterator<String> {
*/ */
@Override @Override
public boolean hasNext() { public boolean hasNext() {
if (recordQueue.isEmpty() && query.isEmpty()) { synchronized (this.recordQueue) {
while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
try {
this.query = downloadPage(this.query, 0);
} catch (final CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: {}", e);
throw new RuntimeException(e);
}
}
if (!this.recordQueue.isEmpty()) {
return true;
}
disconnect(); disconnect();
return false; return false;
} else {
return true;
} }
} }
@ -168,27 +201,34 @@ public class RestIterator implements Iterator<String> {
*/ */
@Override @Override
public String next() { public String next() {
synchronized (recordQueue) { synchronized (this.recordQueue) {
while (recordQueue.isEmpty() && !query.isEmpty()) { return this.recordQueue.poll();
try {
query = downloadPage(query);
} catch (CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: {}", e);
throw new RuntimeException(e);
}
}
return recordQueue.poll();
} }
} }
/* /*
* download page and return nextQuery * download page and return nextQuery (with the attempt number)
*/ */
private String downloadPage(String query) throws CollectorException { private String downloadPage(String query, final int attempt) throws CollectorException {
if (attempt > MAX_ATTEMPTS) {
throw new CollectorException("Max Number of attempts reached, query:" + query);
}
if (attempt > 0) {
final int delay = (attempt * 5000);
log.debug("Attempt {} with delay {}", attempt, delay);
try {
Thread.sleep(delay);
} catch (final InterruptedException e) {
throw new CollectorException(e);
}
}
try {
String resultJson; String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; String resultXml = XML_HEADER;
String nextQuery = ""; String nextQuery = "";
String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
Node resultNode = null; Node resultNode = null;
NodeList nodeList = null; NodeList nodeList = null;
String qUrlArgument = ""; String qUrlArgument = "";
@ -196,81 +236,96 @@ public class RestIterator implements Iterator<String> {
InputStream theHttpInputStream; InputStream theHttpInputStream;
// check if cursor=* is initial set otherwise add it to the queryParam URL // check if cursor=* is initial set otherwise add it to the queryParam URL
if (resumptionType.equalsIgnoreCase("deep-cursor")) { if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query); log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
if (!query.contains("&cursor=")) { if (!query.contains("&cursor=")) {
query += "&cursor=*"; query += "&cursor=*";
} }
} }
// find pagination page start number in queryParam and remove before start the first query
if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
&& (query.contains("paginationStart="))) {
final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
m.find(); // guaranteed to be true for this regex
String[] pageVal = m.group(0).split("=");
pagination = Integer.parseInt(pageVal[1]);
// remove page start number from query and queryParams
queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", "");
query = query.replaceFirst("&?paginationStart=[0-9]+", "");
}
try { try {
log.info("requestig URL [{}]", query); log.info("requesting URL [{}]", query);
URL qUrl = new URL(query); final URL qUrl = new URL(query);
log.debug("authMethod: {}", authMethod); log.debug("authMethod: {}", this.authMethod);
if ("bearer".equalsIgnoreCase(this.authMethod)) { if (this.authMethod == "bearer") {
log.trace("authMethod before inputStream: {}", resultXml); log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); requestHeaders.put("Authorization", "Bearer " + authToken);
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken); // requestHeaders.put("Content-Type", "application/json");
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
conn.setRequestMethod("GET"); log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
theHttpInputStream = conn.getInputStream(); requestHeaders.put("Authorization", "Basic " + authToken);
} else if (BASIC.equalsIgnoreCase(this.authMethod)) { // requestHeaders.put("accept", "application/xml");
log.trace("authMethod before inputStream: {}", resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
} }
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestMethod("GET");
this.setRequestHeader(conn);
resultStream = conn.getInputStream();
resultStream = theHttpInputStream; if ("json".equals(this.resultOutputFormat)) {
if ("json".equals(resultOutputFormat)) { resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson); resultXml = JsonUtils.convertToXML(resultJson);
resultStream = IOUtils.toInputStream(resultXml, UTF_8); this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
} }
if (!(emptyXml).equalsIgnoreCase(resultXml)) { if (!isEmptyXml(resultXml)) {
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE); resultNode = (Node) this.xpath
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET); .evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
log.debug("nodeList.length: {}", nodeList.getLength()); log.debug("nodeList.length: {}", nodeList.getLength());
for (int i = 0; i < nodeList.getLength(); i++) { for (int i = 0; i < nodeList.getLength(); i++) {
StringWriter sw = new StringWriter(); final StringWriter sw = new StringWriter();
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
String toEnqueue = sw.toString(); final String toEnqueue = sw.toString();
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) { if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml); log
.warn(
"The following record resulted in empty item for the feeding queue: {}", resultXml);
} else { } else {
recordQueue.add(sw.toString()); this.recordQueue.add(sw.toString());
} }
} }
} else { } else {
log.warn("resultXml is equal with emptyXml"); log.warn("resultXml is equal with emptyXml");
} }
resumptionInt += resultSizeValue; this.resumptionInt += this.resultSizeValue;
switch (resumptionType.toLowerCase()) { switch (this.resumptionType.toLowerCase()) {
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
resumptionStr = xprResumptionPath.evaluate(resultNode); this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
break; break;
case "count": // begin at one step for all records, iterate over items case "count": // begin at one step for all records, iterate over items
resumptionStr = Integer.toString(resumptionInt); this.resumptionStr = Integer.toString(this.resumptionInt);
break; break;
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808) case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
if (resultSizeValue < 2) { if (this.resultSizeValue < 2) {
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2"); throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
} }
qUrlArgument = qUrl.getQuery(); qUrlArgument = qUrl.getQuery();
String[] arrayQUrlArgument = qUrlArgument.split("&");
for (String arrayUrlArgStr : arrayQUrlArgument) { final String[] arrayQUrlArgument = qUrlArgument.split("&");
if (arrayUrlArgStr.startsWith(resumptionParam)) { for (final String arrayUrlArgStr : arrayQUrlArgument) {
String[] resumptionKeyValue = arrayUrlArgStr.split("="); if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
if (isInteger(resumptionKeyValue[1])) { if (isInteger(resumptionKeyValue[1])) {
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]); urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize); log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
@ -280,61 +335,63 @@ public class RestIterator implements Iterator<String> {
} }
} }
if (((emptyXml).equalsIgnoreCase(resultXml)) if (isEmptyXml(resultXml)
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) { || ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
// resumptionStr = ""; // resumptionStr = "";
if (nodeList != null) { if (nodeList != null) {
discoverResultSize += nodeList.getLength(); this.discoverResultSize += nodeList.getLength();
} }
resultTotal = discoverResultSize; this.resultTotal = this.discoverResultSize;
} else { } else {
resumptionStr = Integer.toString(resumptionInt); this.resumptionStr = Integer.toString(this.resumptionInt);
resultTotal = resumptionInt + 1; this.resultTotal = this.resumptionInt + 1;
if (nodeList != null) { if (nodeList != null) {
discoverResultSize += nodeList.getLength(); this.discoverResultSize += nodeList.getLength();
} }
} }
log.info("discoverResultSize: {}", discoverResultSize); log.info("discoverResultSize: {}", this.discoverResultSize);
break; break;
case "pagination": case "pagination":
case "page": // pagination, iterate over page numbers case "page": // pagination, iterate over page numbers
pagination += 1; if (nodeList != null && nodeList.getLength() > 0) {
if (nodeList != null) { this.discoverResultSize += nodeList.getLength();
discoverResultSize += nodeList.getLength();
} else { } else {
resultTotal = discoverResultSize; this.resultTotal = this.discoverResultSize;
pagination = discoverResultSize; this.pagination = this.discoverResultSize;
} }
resumptionInt = pagination; this.pagination += 1;
resumptionStr = Integer.toString(resumptionInt); this.resumptionInt = this.pagination;
this.resumptionStr = Integer.toString(this.resumptionInt);
break; break;
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
// in
// solr) // solr)
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
// deep-cursor, Param 'resultSizeValue' is less than 2");} // deep-cursor, Param 'resultSizeValue' is less than 2");}
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode)); this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
queryParams = queryParams.replace("&cursor=*", ""); this.queryParams = this.queryParams.replace("&cursor=*", "");
// terminating if length of nodeList is 0 // terminating if length of nodeList is 0
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) { if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue); this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
} else { } else {
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
// resultSizeValue
// because the iteration is over // because the iteration is over
// real length and the // real length and the
// resultSizeValue is added before // resultSizeValue is added before
// the switch() // the switch()
} }
discoverResultSize = nodeList.getLength(); this.discoverResultSize = nodeList.getLength();
log log
.debug( .debug(
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" "downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
+ queryParams + " resumptionLengthIncreased: " + resumptionInt); + this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
break; break;
@ -343,28 +400,30 @@ public class RestIterator implements Iterator<String> {
break; break;
} }
} catch (Exception e) { } catch (final Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
throw new IllegalStateException("collection failed: " + e.getMessage()); throw new IllegalStateException("collection failed: " + e.getMessage());
} }
try { try {
if (resultTotal == -1) { if (this.resultTotal == -1) {
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) { if ("page".equalsIgnoreCase(this.resumptionType)
resultTotal += 1; && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
this.resultTotal += 1;
} // to correct the upper bound } // to correct the upper bound
log.info("resultTotal was -1 is now: " + resultTotal); log.info("resultTotal was -1 is now: " + this.resultTotal);
} }
} catch (Exception e) { } catch (final Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage()); throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
} }
log.debug("resultTotal: " + resultTotal); log.debug("resultTotal: " + this.resultTotal);
log.debug("resInt: " + resumptionInt); log.debug("resInt: " + this.resumptionInt);
if (resumptionInt <= resultTotal) { if (this.resumptionInt <= this.resultTotal) {
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
+ queryFormat; + this.resumptionStr
+ this.queryFormat;
} else { } else {
nextQuery = ""; nextQuery = "";
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the // if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
@ -372,10 +431,18 @@ public class RestIterator implements Iterator<String> {
} }
log.debug("nextQueryUrl: " + nextQuery); log.debug("nextQueryUrl: " + nextQuery);
return nextQuery; return nextQuery;
} catch (final Throwable e) {
log.warn(e.getMessage(), e);
return downloadPage(query, attempt + 1);
}
} }
private boolean isInteger(String s) { private boolean isEmptyXml(String s) {
return EMPTY_XML.equalsIgnoreCase(s);
}
private boolean isInteger(final String s) {
boolean isValidInteger = false; boolean isValidInteger = false;
try { try {
Integer.parseInt(s); Integer.parseInt(s);
@ -383,7 +450,7 @@ public class RestIterator implements Iterator<String> {
// s is a valid integer // s is a valid integer
isValidInteger = true; isValidInteger = true;
} catch (NumberFormatException ex) { } catch (final NumberFormatException ex) {
// s is not an integer // s is not an integer
} }
@ -391,20 +458,36 @@ public class RestIterator implements Iterator<String> {
} }
// Method to encode a string value using `UTF-8` encoding scheme // Method to encode a string value using `UTF-8` encoding scheme
private String encodeValue(String value) { private String encodeValue(final String value) {
try { try {
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString()); return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
} catch (UnsupportedEncodingException ex) { } catch (final UnsupportedEncodingException ex) {
throw new RuntimeException(ex.getCause()); throw new RuntimeException(ex.getCause());
} }
} }
/**
* setRequestHeader
*
* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
* @param conn
*/
private void setRequestHeader(HttpURLConnection conn) {
if (requestHeaders != null) {
for (String key : requestHeaders.keySet()) {
conn.setRequestProperty(key, requestHeaders.get(key));
}
log.debug("Set Request Header with: " + requestHeaders);
}
}
public String getResultFormatValue() { public String getResultFormatValue() {
return resultFormatValue; return this.resultFormatValue;
} }
public String getResultOutputFormat() { public String getResultOutputFormat() {
return resultOutputFormat; return this.resultOutputFormat;
} }
} }
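downloadPage now takes an attempt counter and retries failed requests with a linear back-off (attempt * 5 seconds), giving up after MAX_ATTEMPTS. A stripped-down sketch of that control flow, assuming CollectorException is on the classpath; the Callable stands in for the actual HTTP call and XML handling:

import java.util.concurrent.Callable;

import eu.dnetlib.dhp.common.collection.CollectorException;

public class RetrySketch {
	// bounded retry with linear back-off, mirroring downloadPage(query, attempt)
	static String fetchWithRetry(Callable<String> doFetch, String query, int attempt) throws CollectorException {
		final int maxAttempts = 5;
		if (attempt > maxAttempts) {
			throw new CollectorException("Max Number of attempts reached, query:" + query);
		}
		if (attempt > 0) {
			try {
				Thread.sleep(attempt * 5000L); // 5s, 10s, 15s, ...
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				throw new CollectorException(e);
			}
		}
		try {
			return doFetch.call(); // performs the GET and returns the next query URL
		} catch (Exception e) {
			return fetchWithRetry(doFetch, query, attempt + 1);
		}
	}
}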

View File

@@ -8,7 +8,10 @@ import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
@@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {

	private String element;

+	private List<String> elements;
+
	private InputStream inputStream;

	public XMLIterator(final String element, final InputStream inputStream) {
		super();
		this.element = element;
+		if (element.contains(",")) {
+			elements = Arrays
+				.stream(element.split(","))
+				.filter(StringUtils::isNoneBlank)
+				.map(String::toLowerCase)
+				.collect(Collectors.toList());
+		}
		this.inputStream = inputStream;
		this.parser = getParser();
		try {
			this.current = findElement(parser);
		} catch (XMLStreamException e) {
@@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
			final XMLEvent event = parser.nextEvent();

			// TODO: replace with depth tracking instead of close tag tracking.
-			if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
+			if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
				writer.add(event);
				break;
			}
@@ -142,31 +156,48 @@ public class XMLIterator implements Iterator<String> {
		XMLEvent peek = parser.peek();
		if (peek != null && peek.isStartElement()) {
			String name = peek.asStartElement().getName().getLocalPart();
-			if (element.equals(name)) {
+			if (isCheckTag(name))
				return peek;
-			}
		}

		while (parser.hasNext()) {
-			final XMLEvent event = parser.nextEvent();
+			XMLEvent event = parser.nextEvent();
			if (event != null && event.isStartElement()) {
				String name = event.asStartElement().getName().getLocalPart();
-				if (element.equals(name)) {
+				if (isCheckTag(name))
					return event;
-				}
			}
		}
		return null;
	}

	private XMLEventReader getParser() {
		try {
-			return inputFactory.get().createXMLEventReader(sanitize(inputStream));
+			XMLInputFactory xif = inputFactory.get();
+			xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+			return xif.createXMLEventReader(sanitize(inputStream));
		} catch (XMLStreamException e) {
			throw new RuntimeException(e);
		}
	}

+	private boolean isCheckTag(final String tagName) {
+		if (elements != null) {
+			final String found = elements
+				.stream()
+				.filter(e -> e.equalsIgnoreCase(tagName))
+				.findFirst()
+				.orElse(null);
+			if (found != null)
+				return true;
+		} else {
+			if (element.equalsIgnoreCase(tagName)) {
+				return true;
+			}
+		}
+		return false;
+	}
+
	private Reader sanitize(final InputStream in) {
		final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
		charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
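The multi-element support boils down to normalising the comma-separated splitOnElement value once and then matching tag names case-insensitively. A self-contained sketch of that logic (illustrative class name only):

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class SplitElementSketch {

	public static void main(String[] args) {
		// the value used by FileGZipMultipleNodeTest further below
		String element = "incollection,article";

		// normalise the configured element names, as the new XMLIterator constructor does
		List<String> elements = Arrays
			.stream(element.split(","))
			.filter(s -> !s.trim().isEmpty())
			.map(String::toLowerCase)
			.collect(Collectors.toList());

		// case-insensitive tag matching, equivalent in spirit to isCheckTag(...)
		String tagName = "Article";
		boolean matches = elements.stream().anyMatch(e -> e.equalsIgnoreCase(tagName));
		System.out.println(tagName + " matches: " + matches); // prints: Article matches: true
	}
}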

View File

@ -0,0 +1,25 @@
[
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the hdfs name node",
"paramRequired": false
}, {
"paramName": "wd",
"paramLongName": "workingDir",
"paramDescription": "the hdfs name node",
"paramRequired": false
}
]

View File

@ -0,0 +1,2 @@
inputPath=/data/orcid_2023/tables/
outputPath=/user/miriam.baglioni/peopleAS

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,111 @@
<workflow-app name="PersonEntity" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>inputPath</name>
<description>inputPath</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="atomicactions"/>
<error to="Kill"/>
</action>
<action name="atomicactions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the ActionSet for Person entity and relevant relations</name>
<class>eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=4
--executor-memory=4G
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=5G
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.shuffle.partitions=15000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@@ -16,5 +16,10 @@
		"paramLongName": "isSparkSessionManaged",
		"paramDescription": "the hdfs name node",
		"paramRequired": false
+	},{
+		"paramName": "bl",
+		"paramLongName": "blackListPath",
+		"paramDescription": "the working path",
+		"paramRequired": true
	}
]

View File

@@ -1,2 +1,3 @@
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
outputPath=/tmp/miriam/webcrawlComplete/
+blackListPath=/user/miriam.baglioni/openalex-blackList

View File

@@ -45,6 +45,7 @@
			</spark-opts>
			<arg>--sourcePath</arg><arg>${sourcePath}</arg>
			<arg>--outputPath</arg><arg>${outputPath}</arg>
+			<arg>--blackListPath</arg><arg>${blackListPath}</arg>
		</spark>
		<ok to="End"/>
		<error to="Kill"/>

View File

@ -58,7 +58,7 @@
"uri": "http://dx.doi.org/10.13039/100010414", "uri": "http://dx.doi.org/10.13039/100010414",
"name": "Health Research Board", "name": "Health Research Board",
"synonym": [ "synonym": [
"501100001590" "501100001590", "501100023273"
] ]
}, },
{ {
@ -85,24 +85,6 @@
"name": "Irish College of General Practitioners", "name": "Irish College of General Practitioners",
"synonym": [] "synonym": []
}, },
{
"id": "100012734",
"uri": "http://dx.doi.org/10.13039/100012734",
"name": "Department for Culture, Heritage and the Gaeltacht, Ireland",
"synonym": []
},
{
"id": "100012754",
"uri": "http://dx.doi.org/10.13039/100012754",
"name": "Horizon Pharma",
"synonym": []
},
{
"id": "100012891",
"uri": "http://dx.doi.org/10.13039/100012891",
"name": "Medical Research Charities Group",
"synonym": []
},
{ {
"id": "100012919", "id": "100012919",
"uri": "http://dx.doi.org/10.13039/100012919", "uri": "http://dx.doi.org/10.13039/100012919",
@ -233,7 +215,7 @@
"id": "100018064", "id": "100018064",
"uri": "http://dx.doi.org/10.13039/100018064", "uri": "http://dx.doi.org/10.13039/100018064",
"name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media", "name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media",
"synonym": [] "synonym": ["100012734"]
}, },
{ {
"id": "100018172", "id": "100018172",
@ -281,13 +263,13 @@
"id": "100019637", "id": "100019637",
"uri": "http://dx.doi.org/10.13039/100019637", "uri": "http://dx.doi.org/10.13039/100019637",
"name": "Horizon Therapeutics", "name": "Horizon Therapeutics",
"synonym": [] "synonym": ["100012754"]
}, },
{ {
"id": "100020174", "id": "100020174",
"uri": "http://dx.doi.org/10.13039/100020174", "uri": "http://dx.doi.org/10.13039/100020174",
"name": "Health Research Charities Ireland", "name": "Health Research Charities Ireland",
"synonym": [] "synonym": ["100012891"]
}, },
{ {
"id": "100020202", "id": "100020202",
@ -319,12 +301,7 @@
"name": "Centre for Ageing Research and Development in Ireland", "name": "Centre for Ageing Research and Development in Ireland",
"synonym": [] "synonym": []
}, },
{
"id": "501100001583",
"uri": "http://dx.doi.org/10.13039/501100001583",
"name": "Cystinosis Foundation Ireland",
"synonym": []
},
{ {
"id": "501100001584", "id": "501100001584",
"uri": "http://dx.doi.org/10.13039/501100001584", "uri": "http://dx.doi.org/10.13039/501100001584",
@ -521,7 +498,7 @@
"id": "501100003037", "id": "501100003037",
"uri": "http://dx.doi.org/10.13039/501100003037", "uri": "http://dx.doi.org/10.13039/501100003037",
"name": "Elan", "name": "Elan",
"synonym": [] "synonym": ["501100021694"]
}, },
{ {
"id": "501100003496", "id": "501100003496",
@ -595,17 +572,11 @@
"name": "Technological University Dublin", "name": "Technological University Dublin",
"synonym": [] "synonym": []
}, },
{
"id": "501100009269",
"uri": "http://dx.doi.org/10.13039/501100009269",
"name": "Programme of Competitive Forestry Research for Development",
"synonym": []
},
{ {
"id": "501100009315", "id": "501100009315",
"uri": "http://dx.doi.org/10.13039/501100009315", "uri": "http://dx.doi.org/10.13039/501100009315",
"name": "Cystinosis Ireland", "name": "Cystinosis Ireland",
"synonym": [] "synonym": ["501100001583"]
}, },
{ {
"id": "501100010808", "id": "501100010808",
@ -625,12 +596,6 @@
"name": "Alimentary Health", "name": "Alimentary Health",
"synonym": [] "synonym": []
}, },
{
"id": "501100011103",
"uri": "http://dx.doi.org/10.13039/501100011103",
"name": "Rann\u00eds",
"synonym": []
},
{ {
"id": "501100012354", "id": "501100012354",
"uri": "http://dx.doi.org/10.13039/501100012354", "uri": "http://dx.doi.org/10.13039/501100012354",
@ -733,12 +698,6 @@
"name": "Insight SFI Research Centre for Data Analytics", "name": "Insight SFI Research Centre for Data Analytics",
"synonym": [] "synonym": []
}, },
{
"id": "501100021694",
"uri": "http://dx.doi.org/10.13039/501100021694",
"name": "Elan Pharma International",
"synonym": []
},
{ {
"id": "501100021838", "id": "501100021838",
"uri": "http://dx.doi.org/10.13039/501100021838", "uri": "http://dx.doi.org/10.13039/501100021838",
@ -769,12 +728,6 @@
"name": "Institute of Technology, Tralee", "name": "Institute of Technology, Tralee",
"synonym": [] "synonym": []
}, },
{
"id": "501100023273",
"uri": "http://dx.doi.org/10.13039/501100023273",
"name": "HRB Clinical Research Facility Galway",
"synonym": []
},
{ {
"id": "501100023378", "id": "501100023378",
"uri": "http://dx.doi.org/10.13039/501100023378", "uri": "http://dx.doi.org/10.13039/501100023378",

View File

@@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
          tp._1 match {
            case "electronic" => journal.setIssnOnline(tp._2)
            case "print"      => journal.setIssnPrinted(tp._2)
+           case _            =>
          }
        })
      }

View File

@ -79,23 +79,6 @@ object MagUtility extends Serializable {
private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME) private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
private val MAGDataInfo: DataInfo = { private val MAGDataInfo: DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust("0.9")
di.setProvenanceaction(
OafMapperUtils.qualifier(
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS
)
)
di
}
private val MAGDataInfoInvisible: DataInfo = {
val di = new DataInfo val di = new DataInfo
di.setDeletedbyinference(false) di.setDeletedbyinference(false)
di.setInferred(false) di.setInferred(false)
@ -453,7 +436,6 @@ object MagUtility extends Serializable {
case "repository" => case "repository" =>
result = new Publication() result = new Publication()
result.setDataInfo(MAGDataInfoInvisible)
qualifier( qualifier(
"0038", "0038",
"Other literature type", "Other literature type",
@ -488,7 +470,6 @@ object MagUtility extends Serializable {
} }
if (result != null) { if (result != null) {
if (result.getDataInfo == null)
result.setDataInfo(MAGDataInfo) result.setDataInfo(MAGDataInfo)
val i = new Instance val i = new Instance
i.setInstancetype(tp) i.setInstancetype(tp)
@ -512,7 +493,7 @@ object MagUtility extends Serializable {
return null return null
result.setCollectedfrom(List(MAGCollectedFrom).asJava) result.setCollectedfrom(List(MAGCollectedFrom).asJava)
val pidList = List( var pidList = List(
structuredProperty( structuredProperty(
paper.paperId.get.toString, paper.paperId.get.toString,
qualifier( qualifier(
@ -525,8 +506,6 @@ object MagUtility extends Serializable {
) )
) )
result.setPid(pidList.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}") result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@ -618,10 +597,9 @@ object MagUtility extends Serializable {
} }
val instance = result.getInstance().get(0) val instance = result.getInstance().get(0)
instance.setPid(pidList.asJava)
if (paper.doi.orNull != null) if (paper.doi.orNull != null) {
instance.setAlternateIdentifier( pidList = pidList ::: List(
List(
structuredProperty( structuredProperty(
paper.doi.get, paper.doi.get,
qualifier( qualifier(
@ -632,8 +610,10 @@ object MagUtility extends Serializable {
), ),
null null
) )
).asJava
) )
}
instance.setPid(pidList.asJava)
result.setPid(pidList.asJava)
instance.setUrl(paper.urls.get.asJava) instance.setUrl(paper.urls.get.asJava)
instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
instance.setCollectedfrom(MAGCollectedFrom) instance.setCollectedfrom(MAGCollectedFrom)

View File

@@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
    spark.read
      .load(s"$magBasePath/mag_denormalized")
      .as[MAGPaper]
+     .filter(col("doi").isNotNull)
      .map(s => MagUtility.convertMAGtoOAF(s))
      .filter(s => s != null)
      .write
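The added filter drops MAG papers without a DOI before the OAF conversion. The same guard, written as a stand-alone Spark job against a hypothetical local copy of the denormalised MAG table:

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class MagDoiFilterSketch {

	public static void main(String[] args) {
		SparkSession spark = SparkSession
			.builder()
			.master("local[*]")
			.appName("mag-doi-filter")
			.getOrCreate();

		// assumption: the denormalised MAG dump is a parquet table with a nullable "doi" column
		Dataset<Row> papers = spark.read().load("/tmp/mag/mag_denormalized");

		// keep only records carrying a DOI, mirroring the added .filter(col("doi").isNotNull)
		Dataset<Row> withDoi = papers.filter(col("doi").isNotNull());

		System.out.println("papers with a DOI: " + withDoi.count());
		spark.stop();
	}
}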

View File

@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.io.InputStream import java.io.{ByteArrayInputStream, InputStream}
import scala.io.Source import java.nio.charset.Charset
import scala.xml.pull.XMLEventReader import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame { object SparkCreateBaselineDataFrame {
@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} else } else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch { } catch {
case e: Throwable => case e: Throwable =>
println(s"Error on requesting ${r.getURI}") println(s"Error on requesting ${r.getURI}")
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString( IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream( SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
) ),
Charset.defaultCharset()
) )
) )
parser.parseArgument(args) parser.parseArgument(args)
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath) log.info("workingPath: {}", workingPath)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion") val targetPath = parser.get("targetPath")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion) log.info("targetPath: {}", targetPath)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val hdfsServerUri = parser.get("hdfsServerUri") val hdfsServerUri = parser.get("hdfsServerUri")
log.info("hdfsServerUri: {}", hdfsServerUri) log.info("hdfsServerUri: {}", targetPath)
val skipUpdate = parser.get("skipUpdate") val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate) log.info("skipUpdate: {}", skipUpdate)
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) { if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val inputFactory = XMLInputFactory.newInstance
val ds: Dataset[PMArticle] = spark.createDataset( val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz")) k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => { .flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
new PMParser(xml) new PMParser(xml)
}) })
) )
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
.map(a => PubMedToOaf.convert(a, vocabularies)) .map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf] .as[Oaf]
.filter(p => p != null), .filter(p => p != null),
s"$outputBasePath/$MDSTORE_DATA_PATH" targetPath
) )
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
} }
} }
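The baseline job now parses the PubMed XML through the StAX API instead of the removed scala.xml.pull reader. The essential move is creating an XMLEventReader from an in-memory payload via XMLInputFactory; a minimal sketch (the sample XML string is made up):

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.XMLEvent;

public class StaxPubmedSketch {

	public static void main(String[] args) throws Exception {
		// assumption: the record arrives as an in-memory string, as with the wholeTextFiles RDD entries
		String xml = "<PubmedArticleSet><PubmedArticle><PMID>12345</PMID></PubmedArticle></PubmedArticleSet>";

		XMLInputFactory inputFactory = XMLInputFactory.newInstance();
		XMLEventReader reader = inputFactory
			.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));

		// walk the StAX events, which is what PMParser does internally
		while (reader.hasNext()) {
			XMLEvent event = reader.nextEvent();
			if (event.isStartElement()) {
				System.out.println("start: " + event.asStartElement().getName().getLocalPart());
			}
		}
	}
}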

View File

@@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed

import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}

/** @param xml
  */

View File

@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@@ -119,7 +119,9 @@ public class ReadCOCITest {
				workingDir.toString() + "/COCI",
				"-outputPath",
				workingDir.toString() + "/COCI_json/",
-				"-inputFile", "input1;input2;input3;input4;input5"
+				"-inputFile", "input1;input2;input3;input4;input5",
+				"-format",
+				"COCI"
			});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

View File

@ -0,0 +1,224 @@
package eu.dnetlib.dhp.actionmanager.person;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob;
import eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson;
import eu.dnetlib.dhp.collection.orcid.model.Author;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.utils.DHPUtils;
public class CreatePersonAS {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(CreatePersonAS.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(CreatePersonAS.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(CreatePersonAS.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.codegen.wholeStage", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(CreatePersonAS.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void testAuthors() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/person/")
.getPath();
// spark
// .read()
// .parquet(inputPath + "Authors")
// .as(Encoders.bean(Author.class))
// .filter((FilterFunction<Author>) a -> Optional.ofNullable(a.getOtherNames()).isPresent() &&
// Optional.ofNullable(a.getBiography()).isPresent())
// .write()
// .mode(SaveMode.Overwrite)
// .parquet(workingDir.toString() + "AuthorsSubset");
ExtractPerson
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet1",
"-workingDir",
workingDir.toString() + "/working"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> relations = sc
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
.filter(v -> "eu.dnetlib.dhp.schema.oaf.Relation".equalsIgnoreCase(v._1().toString()))
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
//
JavaRDD<Person> people = sc
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
.filter(v -> "eu.dnetlib.dhp.schema.oaf.Person".equalsIgnoreCase(v._1().toString()))
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Person) aa.getPayload()));
//
Assertions.assertEquals(7, people.count());
Assertions
.assertEquals(
"Paulo",
people
.filter(
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
.first()
.getGivenName());
Assertions
.assertEquals(
"Tavares",
people
.filter(
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
.first()
.getFamilyName());
Assertions
.assertEquals(
4,
people
.filter(
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
.first()
.getAlternativeNames()
.size());
Assertions
.assertEquals(
4,
people
.filter(
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
.first()
.getPid()
.size());
Assertions
.assertTrue(
people
.filter(
p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
.first()
.getPid()
.stream()
.anyMatch(
p -> p.getSchema().equalsIgnoreCase("Scopus Author ID")
&& p.getValue().equalsIgnoreCase("15119405200")));
Assertions
.assertEquals(
16,
relations
.filter(r -> r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED))
.count());
Assertions
.assertEquals(
14,
relations
.filter(r -> r.getRelClass().equalsIgnoreCase(ModelConstants.PERSON_PERSON_HASCOAUTHORED))
.count());
Assertions
.assertEquals(
3,
relations
.filter(
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED))
.count());
Assertions
.assertEquals(
2,
relations
.filter(
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED)
&& r.getTarget().startsWith("50|doi"))
.count());
Assertions
.assertEquals(
1,
relations
.filter(
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED)
&& r.getTarget().startsWith("50|arXiv"))
.count());
Assertions
.assertEquals(
1,
relations
.filter(
r -> r.getSource().equalsIgnoreCase("30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619"))
&& r.getRelClass().equalsIgnoreCase(ModelConstants.PERSON_PERSON_HASCOAUTHORED))
.count());
Assertions.assertEquals(33, relations.count());
}
}

View File

@ -75,7 +75,11 @@ public class CreateASTest {
String inputPath = getClass() String inputPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl/") "/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
.getPath();
String blackListPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
.getPath(); .getPath();
CreateActionSetFromWebEntries CreateActionSetFromWebEntries
@ -86,7 +90,8 @@ public class CreateASTest {
"-sourcePath", "-sourcePath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet1" workingDir.toString() + "/actionSet1",
"-blackListPath", blackListPath
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -96,7 +101,7 @@ public class CreateASTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
Assertions.assertEquals(64, tmp.count()); Assertions.assertEquals(58, tmp.count());
} }
@ -109,6 +114,10 @@ public class CreateASTest {
.getResource( .getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl/") "/eu/dnetlib/dhp/actionmanager/webcrawl/")
.getPath(); .getPath();
String blackListPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
.getPath();
CreateActionSetFromWebEntries CreateActionSetFromWebEntries
.main( .main(
@ -118,7 +127,8 @@ public class CreateASTest {
"-sourcePath", "-sourcePath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet1" workingDir.toString() + "/actionSet1",
"-blackListPath", blackListPath
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -184,7 +194,7 @@ public class CreateASTest {
Assertions Assertions
.assertEquals( .assertEquals(
5, tmp 2, tmp
.filter( .filter(
r -> r r -> r
.getSource() .getSource()
@ -197,7 +207,7 @@ public class CreateASTest {
Assertions Assertions
.assertEquals( .assertEquals(
5, tmp 2, tmp
.filter( .filter(
r -> r r -> r
.getTarget() .getTarget()
@ -210,7 +220,7 @@ public class CreateASTest {
Assertions Assertions
.assertEquals( .assertEquals(
2, tmp 1, tmp
.filter( .filter(
r -> r r -> r
.getTarget() .getTarget()
@ -224,7 +234,7 @@ public class CreateASTest {
Assertions Assertions
.assertEquals( .assertEquals(
2, tmp 1, tmp
.filter( .filter(
r -> r r -> r
.getTarget() .getTarget()
@ -238,7 +248,7 @@ public class CreateASTest {
Assertions Assertions
.assertEquals( .assertEquals(
1, tmp 0, tmp
.filter( .filter(
r -> r r -> r
.getTarget() .getTarget()

View File

@ -0,0 +1,64 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.IOException;
import java.util.HashMap;
import java.util.Objects;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)
public class FileGZipMultipleNodeTest {
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
private final ApiDescriptor api = new ApiDescriptor();
private FileGZipCollectorPlugin plugin;
private static final String SPLIT_ON_ELEMENT = "incollection,article";
@BeforeEach
public void setUp() throws IOException {
final String gzipFile = Objects
.requireNonNull(
this
.getClass()
.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
.getFile();
api.setBaseUrl(gzipFile);
HashMap<String, String> params = new HashMap<>();
params.put("splitOnElement", SPLIT_ON_ELEMENT);
api.setParams(params);
FileSystem fs = FileSystem.get(new Configuration());
plugin = new FileGZipCollectorPlugin(fs);
}
@Test
void test() throws CollectorException {
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
stream.limit(10).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
log.info(s);
});
}
}

View File

@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.rest;

import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;

import org.junit.jupiter.api.Assertions;
@@ -35,11 +36,11 @@ public class OsfPreprintCollectorTest {
	private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";

	private final String resumptionParam = "page";
-	private final String resumptionType = "page";
-	private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
+	private final String resumptionType = "scan";
+	private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";

-	private final String resultSizeParam = "";
-	private final String resultSizeValue = "";
+	private final String resultSizeParam = "page[size]";
+	private final String resultSizeValue = "100";

	private final String resultFormatParam = "format";
	private final String resultFormatValue = "json";
@@ -69,11 +70,11 @@ public class OsfPreprintCollectorTest {
	@Test
	@Disabled
-	void test() throws CollectorException {
+	void test_limited() throws CollectorException {
		final AtomicInteger i = new AtomicInteger(0);
		final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());

-		stream.limit(200).forEach(s -> {
+		stream.limit(2000).forEach(s -> {
			Assertions.assertTrue(s.length() > 0);
			i.incrementAndGet();
			log.info(s);
@@ -82,4 +83,23 @@ public class OsfPreprintCollectorTest {
		log.info("{}", i.intValue());
		Assertions.assertTrue(i.intValue() > 0);
	}
+
+	@Test
+	@Disabled
+	void test_all() throws CollectorException {
+		final AtomicLong i = new AtomicLong(0);
+		final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
+
+		stream.forEach(s -> {
+			Assertions.assertTrue(s.length() > 0);
+			if ((i.incrementAndGet() % 1000) == 0) {
+				log.info("COLLECTED: {}", i.get());
+			}
+		});
+
+		log.info("TOTAL: {}", i.get());
+		Assertions.assertTrue(i.get() > 0);
+	}
}

View File

@ -4,6 +4,11 @@
package eu.dnetlib.dhp.collection.plugin.rest; package eu.dnetlib.dhp.collection.plugin.rest;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap; import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream; import java.util.stream.Stream;
@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
@ -25,18 +32,18 @@ class RestCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
private final String resumptionType = "count"; private final String resumptionType = "discover";
private final String resumptionParam = "from"; private final String resumptionParam = "skip";
private final String entityXpath = "//hits/hits"; private final String entityXpath = "//*[local-name()='data']";
private final String resumptionXpath = "//hits"; private final String resumptionXpath = "";
private final String resultTotalXpath = "//hits/total"; private final String resultTotalXpath = "//*[local-name()='count']";
private final String resultFormatParam = "format"; private final String resultFormatParam = "";
private final String resultFormatValue = "json"; private final String resultFormatValue = "json";
private final String resultSizeParam = "size"; private final String resultSizeParam = "top";
private final String resultSizeValue = "10"; private final String resultSizeValue = "10";
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; private final String query = "";
// private String query = "=(sources:engrXiv AND type:preprint)"; // private String query = "=(sources:engrXiv AND type:preprint)";
private final String protocolDescriptor = "rest_json2xml"; private final String protocolDescriptor = "rest_json2xml";
@ -56,6 +63,7 @@ class RestCollectorPluginTest {
params.put("resultSizeValue", resultSizeValue); params.put("resultSizeValue", resultSizeValue);
params.put("queryParams", query); params.put("queryParams", query);
params.put("entityXpath", entityXpath); params.put("entityXpath", entityXpath);
params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
api.setBaseUrl(baseUrl); api.setBaseUrl(baseUrl);
api.setParams(params); api.setParams(params);
@ -78,4 +86,19 @@ class RestCollectorPluginTest {
log.info("{}", i.intValue()); log.info("{}", i.intValue());
Assertions.assertTrue(i.intValue() > 0); Assertions.assertTrue(i.intValue() > 0);
} }
@Disabled
@Test
void testUrl() throws IOException {
String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
URL url = new URL(url_s);
final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
conn.setRequestProperty("User-Agent", "OpenAIRE");
Gson gson = new Gson();
System.out.println("Request header");
System.out.println(gson.toJson(conn.getHeaderFields()));
InputStream inputStream = conn.getInputStream();
}
} }
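The test above supplies the new requestHeaderMap parameter as a JSON object. A small sketch of turning that string into the header map with Gson, which the test already imports (the helper class name is an assumption):

import java.lang.reflect.Type;
import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class RequestHeaderMapSketch {

	public static void main(String[] args) {
		// the same JSON object the test passes as "requestHeaderMap"
		String requestHeaderMap = "{\"User-Agent\": \"OpenAIRE DEV\"}";

		// parse it into a String-to-String map
		Type type = new TypeToken<Map<String, String>>() {
		}.getType();
		Map<String, String> headers = new Gson().fromJson(requestHeaderMap, type);

		headers.forEach((k, v) -> System.out.println(k + ": " + v));
	}
}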

View File

@@ -44,7 +44,7 @@ public class RestIteratorTest {
		final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
			resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
-			query, entityXpath, authMethod, authToken, resultOffsetParam);
+			query, entityXpath, authMethod, authToken, resultOffsetParam, null);
		int i = 20;
		while (iterator.hasNext() && i > 0) {
			String result = iterator.next();

View File

@ -0,0 +1,10 @@
{"orcid":"0000-0001-6291-9619","title":"A Visible Light Driven Photoelectrochemical Chloramphenicol Aptasensor Based on a Gold Nanoparticle-Functionalized 3D Flower-like MoS<sub>2</sub>/TiO<sub>2</sub> Heterostructure","pids":[{"value":"10.1021/acs.langmuir.1c02956","schema":"doi"},{"value":"2-s2.0-85124885368","schema":"eid"},{"value":"15205827 07437463","schema":"issn"}]}
{"orcid":"0000-0002-3210-3034","title":"A Visible Light Driven Photoelectrochemical Chloramphenicol Aptasensor Based on a Gold Nanoparticle-Functionalized 3D Flower-like MoS<sub>2</sub>/TiO<sub>2</sub> Heterostructure","pids":[{"value":"10.1021/acs.langmuir.1c02956","schema":"doi"},{"value":"2-s2.0-85124885368","schema":"eid"},{"value":"15205827 07437463","schema":"issn"}]}
{"orcid":"0000-0001-6291-9619","title":"Study of High-Transverse-Momentum Higgs Boson Production in Association with a Vector Boson in the <math display=\"inline\"><mrow><mi>q</mi><mi>q</mi><mi>b</mi><mi>b</mi></mrow></math> Final State with the ATLAS Detector","pids":[{"value":"2736741","schema":"other-id"},{"value":"10.1103/PhysRevLett.132.131802","schema":"doi"},{"value":"2312.07605","schema":"arxiv"}]}
{"orcid":"0000-0002-3210-3034","title":"Study of High-Transverse-Momentum Higgs Boson Production in Association with a Vector Boson in the <math display=\"inline\"><mrow><mi>q</mi><mi>q</mi><mi>b</mi><mi>b</mi></mrow></math> Final State with the ATLAS Detector","pids":[{"value":"2736741","schema":"other-id"},{"value":"10.1103/PhysRevLett.132.131802","schema":"doi"},{"value":"2312.07605","schema":"arxiv"}]}
{"orcid":"0000-0002-9030-7609","title":"Search for supersymmetry in a final state containing two photons and missing transverse momentum in √s = 13 TeV pp collisions at the LHC using the ATLAS detector","pids":[{"value":"10.1140/epjc/s10052-016-4344-x","schema":"doi"},{"value":"2-s2.0-84988710988","schema":"eid"},{"value":"14346052 14346044","schema":"issn"}]}
{"orcid":"0000-0003-2552-9691","title":"Search for supersymmetry in a final state containing two photons and missing transverse momentum in $\\sqrt{s}$ = 13 TeV $pp$ collisions at the LHC using the ATLAS detector","pids":[{"value":"1473744","schema":"other-id"},{"value":"10.1140/epjc/s10052-016-4344-x","schema":"doi"},{"value":"1606.09150","schema":"arxiv"}]}
{"orcid":"0000-0003-0305-8980","title":"Search for supersymmetry in a final state containing two photons and missing transverse momentum in √s = 13 TeV pp collisions at the LHC using the ATLAS detector","pids":[{"value":"10.1140/epjc/s10052-016-4344-x","schema":"doi"},{"value":"2-s2.0-84988710988","schema":"eid"}]}
{"orcid":"0000-0002-9030-7609","title":"Measurement of the energy response of the ATLAS calorimeter to charged pions from $W^{\\pm }\\rightarrow \\tau ^{\\pm }(\\rightarrow \\pi ^{\\pm }\\nu _{\\tau })\\nu _{\\tau }$ events in Run 2 data","pids":[{"value":"1909507","schema":"other-id"},{"value":"10.1140/epjc/s10052-022-10117-2","schema":"doi"},{"value":"2108.09043","schema":"arxiv"}]}
{"orcid":"0000-0003-2629-4046","title":"Measurement of the energy response of the ATLAS calorimeter to charged pions from $W^{\\pm }\\rightarrow \\tau ^{\\pm }(\\rightarrow \\pi ^{\\pm }\\nu _{\\tau })\\nu _{\\tau }$ events in Run 2 data","pids":[{"value":"1909507","schema":"other-id"},{"value":"10.1140/epjc/s10052-022-10117-2","schema":"doi"},{"value":"2108.09043","schema":"arxiv"}]}
{"orcid":"0000-0001-8582-8912","title":"Measurement of the energy response of the ATLAS calorimeter to charged pions from $W^{\\pm }\\rightarrow \\tau ^{\\pm }(\\rightarrow \\pi ^{\\pm }\\nu _{\\tau })\\nu _{\\tau }$ events in Run 2 data","pids":[{"value":"1909507","schema":"other-id"},{"value":"10.1140/epjc/s10052-022-10117-2","schema":"doi"},{"value":"2108.09043","schema":"arxiv"}]}

View File

@@ -789,10 +789,6 @@
		"value": "2227-9717",
		"type": "electronic"
	},
-	{
-		"value": "VALUE",
-		"type": "PIPPO"
-	},
	{
		"value": "1063-4584",
		"type": "pu"

View File

@@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import org.apache.commons.io.IOUtils
+import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
@@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
    super.setUpVocabulary()
  }

+  @Test
+  def mappingRecord(): Unit = {
+    val input =
+      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
+
+    println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
+  }
+
}

View File

@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
@@ -18,10 +19,8 @@ class MAGMappingTest {
      .master("local[*]")
      .getOrCreate()

-    val s = new SparkMagOrganizationAS(null, null, null)
-    s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
+    val s = new SparkMAGtoOAF(null, null, null)
+    s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
  }

  @Test

View File

@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader} import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer import scala.collection.mutable.ListBuffer
import scala.io.Source import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testEBIData() = { def testEBIData() = {
val inputXML = Source val inputFactory = XMLInputFactory.newInstance
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
} }
@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testParsingPubmedXML(): Unit = { def testParsingPubmedXML(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml) val parser = new PMParser(xml)
parser.foreach(checkPMArticle) parser.foreach(checkPMArticle)
} }
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testPubmedMapping(): Unit = { def testPubmedMapping(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val parser = new PMParser(xml) val parser = new PMParser(xml)
val results = ListBuffer[Oaf]() val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))

View File

@ -53,24 +53,10 @@
<artifactId>dhp-pace-core</artifactId> <artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
<version>2.11.0</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
@ -79,16 +65,10 @@
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_${scala.binary.version}</artifactId> <artifactId>spark-graphx_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency> <dependency>
<groupId>dom4j</groupId> <groupId>dom4j</groupId>
<artifactId>dom4j</artifactId> <artifactId>dom4j</artifactId>
@ -101,10 +81,6 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId> <artifactId>jackson-databind</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>

View File

@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple3; import scala.Tuple3;
import scala.collection.JavaConversions; import scala.collection.JavaConversions;
@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
Dataset<Row> pivotHistory = spark Dataset<Row> pivotHistory = spark
.createDataset( .createDataset(
Collections.emptyList(), Collections.emptyList(),
RowEncoder SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
if (StringUtils.isNotBlank(pivotHistoryDatabase)) { if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
pivotHistory = spark pivotHistory = spark

View File

@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
.map( .map(
(MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel( (MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
"", "",
r._1()._2().getOriginalId().get(0), Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "", Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "", Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "", Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "", Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
r._1()._2().getCollectedfrom().get(0).getValue(), Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
"", "",
structuredPropertyListToString(r._1()._2().getPid()), structuredPropertyListToString(r._1()._2().getPid()),
parseECField(r._1()._2().getEclegalbody()), parseECField(r._1()._2().getEclegalbody()),

View File

@@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
final Organization o = r._2()._2();
return new OrgSimRel(
r._1()._1(),
- o.getOriginalId().get(0),
+ Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
@@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
.map(
(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
OrgSimRel orgSimRel = r._1()._2();
- orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
+ orgSimRel
+ .setLocal_id(
+ Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
return orgSimRel;
},
Encoders.bean(OrgSimRel.class));

View File

@@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*;
- import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+ import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple2;
import scala.Tuple3;
@@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
StructType idsSchema = StructType
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
- Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+ Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
String entityPath = graphBasePath + '/' + entityType.name();

View File

@@ -0,0 +1,103 @@
package eu.dnetlib.dhp.oa.dedup;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
class DatasetMergerTest implements Serializable {
private List<Tuple2<String, Dataset>> datasets;
private String testEntityBasePath;
private DataInfo dataInfo;
private final String dedupId = "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148";
private Dataset dataset_top;
@BeforeEach
public void setUp() throws Exception {
testEntityBasePath = Paths
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
.toFile()
.getAbsolutePath();
datasets = readSample(testEntityBasePath + "/dataset_merge.json", Dataset.class);
dataset_top = getTopPub(datasets);
dataInfo = setDI();
}
@Test
void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
// verify id
assertEquals(dedupId, pub_merged.getId());
assertEquals(2, pub_merged.getInstance().size());
}
public DataInfo setDI() {
DataInfo dataInfo = new DataInfo();
dataInfo.setTrust("0.9");
dataInfo.setDeletedbyinference(false);
dataInfo.setInferenceprovenance("testing");
dataInfo.setInferred(true);
return dataInfo;
}
public Dataset getTopPub(List<Tuple2<String, Dataset>> publications) {
Double maxTrust = 0.0;
Dataset maxPub = new Dataset();
for (Tuple2<String, Dataset> publication : publications) {
Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
if (pubTrust > maxTrust) {
maxTrust = pubTrust;
maxPub = publication._2();
}
}
return maxPub;
}
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
List<Tuple2<String, T>> res = new ArrayList<>();
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(path));
String line = reader.readLine();
while (line != null) {
res
.add(
new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz)));
// read next line
line = reader.readLine();
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return res;
}
}

View File

@@ -93,14 +93,14 @@ class EntityMergerTest implements Serializable {
assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
- assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
+ assertEquals(pub_top.getResulttype().getClassid(), pub_merged.getResulttype().getClassid());
- assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
+ assertEquals(pub_top.getLanguage().getClassid(), pub_merged.getLanguage().getClassid());
- assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
+ assertEquals("Elsevier BV", pub_merged.getPublisher().getValue());
- assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
+ assertEquals(pub_top.getEmbargoenddate().getValue(), pub_merged.getEmbargoenddate().getValue());
assertEquals(pub_top.getResourcetype().getClassid(), "");
assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
- assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
+ // assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
assertEquals(3, pub_merged.getInstance().size());
assertEquals(2, pub_merged.getCountry().size());
assertEquals(0, pub_merged.getSubject().size());

File diff suppressed because one or more lines are too long

View File

@@ -172,7 +172,7 @@ public class SparkBulkTagJob {
.option("compression", "gzip")
.json(outputPath + "project");
- readPath(spark, outputPath + "project", Datasource.class)
+ readPath(spark, outputPath + "project", Project.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")

View File

@@ -61,7 +61,8 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
subject.getQualifier().setClassname(vocabulary.getName());
}
} else {
- final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo())
+ final String provenanceActionClassId = Optional
+ .ofNullable(subject.getDataInfo())
.map(DataInfo::getProvenanceaction)
.map(Qualifier::getClassid)
.orElse(null);

View File

@@ -398,6 +398,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
o.setCountry(prepareQualifierSplitting(rs.getString("country")));
+ o.setOrganizationType(Organization.OrganizationType.valueOf(rs.getString("typology")));
o.setDataInfo(info);
o.setLastupdatetimestamp(lastUpdateTimestamp);

View File

@@ -156,6 +156,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -190,6 +191,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -224,6 +226,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -258,6 +261,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -292,6 +296,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -326,6 +331,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -360,6 +366,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -394,6 +401,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
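All the actions above now request executor memory overhead equal to the executor heap. A short sketch of the same pairing set programmatically; the sizes are placeholders, not values taken from the workflow:

import org.apache.spark.SparkConf;

class ExecutorSizingSketch {

	// With these placeholder sizes the YARN container request becomes roughly
	// heap + overhead = 8g + 8g, which is the effect of setting
	// spark.executor.memoryOverhead to ${sparkExecutorMemory} in the workflow above.
	static SparkConf conf() {
		return new SparkConf()
			.set("spark.executor.memory", "8g")
			.set("spark.executor.memoryOverhead", "8g");
	}
}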

View File

@@ -116,17 +116,19 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=10000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>8000</arg>
+ <arg>--numPartitions</arg><arg>10000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@@ -143,17 +145,19 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=4000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>4000</arg>
+ <arg>--numPartitions</arg><arg>8000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@@ -170,11 +174,13 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
@@ -197,17 +203,19 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=1000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>300</arg>
+ <arg>--numPartitions</arg><arg>1000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@@ -224,17 +232,19 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=200
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>100</arg>
+ <arg>--numPartitions</arg><arg>200</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@@ -251,17 +261,19 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=1000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/organization</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>400</arg>
+ <arg>--numPartitions</arg><arg>1000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@@ -278,17 +290,19 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=1000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/project</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>100</arg>
+ <arg>--numPartitions</arg><arg>1000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@@ -305,17 +319,19 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=15000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/relation</arg>
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>10000</arg>
+ <arg>--numPartitions</arg><arg>15000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>

View File

@@ -45,6 +45,7 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=15000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -79,6 +80,7 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=10000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}

View File

@@ -28,7 +28,8 @@ SELECT
(array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1] AS ecinternationalorganization,
(array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1] AS ecenterprise,
(array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1] AS ecsmevalidated,
- (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode
+ (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode,
+ org_types.name AS typology
FROM organizations o
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
@@ -37,6 +38,7 @@ FROM organizations o
LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different')
LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
LEFT OUTER JOIN other_ids idup ON (od.id = idup.id)
+ LEFT OUTER JOIN org_types ON (org_types.val = o.type)
WHERE
o.status = 'approved' OR o.status = 'suggested'
GROUP BY
@@ -44,4 +46,5 @@ GROUP BY
o.name,
o.creation_date,
o.modification_date,
- o.country;
+ o.country,
+ org_types.name;

View File

@@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
]

View File

@@ -0,0 +1,166 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
}
}
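This vocabulary is consumed by the Scala utilities in the next file (ScholexplorerUtils.invRel); a Java sketch of the same lookup, using the resource path shown there, with illustrative class names:

import java.io.InputStream;
import java.util.Map;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

class RelationVocabularySketch {

	// One vocabulary entry: the canonical relation name and its inverse.
	public static class Entry {
		public String original;
		public String inverse;
	}

	// Resolves the inverse of a relation class, e.g. inverseOf("cites") returns "IsCitedBy".
	static String inverseOf(String relClass) throws Exception {
		try (InputStream in = RelationVocabularySketch.class
			.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")) {
			Map<String, Entry> vocab = new ObjectMapper()
				.readValue(in, new TypeReference<Map<String, Entry>>() {});
			Entry e = vocab.get(relClass.toLowerCase());
			return e == null ? null : e.inverse;
		}
	}
}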

View File

@@ -25,6 +25,22 @@ object SparkApplyHostedByMapToResult {
val i = p.getInstance().asScala
if (i.size == 1) {
val inst: Instance = i.head
+ patchInstance(p, ei, inst)
+ } else {
+ val cf = i.map(ii => ii.getCollectedfrom.getValue)
+ if (cf.contains("Crossref")) {
+ i.foreach(ii => {
+ patchInstance(p, ei, ii)
+ })
+ }
+ }
+ }
+ p
+ })(Encoders.bean(classOf[Publication]))
+ }
+ private def patchInstance(p: Publication, ei: EntityInfo, inst: Instance): Unit = {
inst.getHostedby.setKey(ei.getHostedById)
inst.getHostedby.setValue(ei.getName)
if (ei.getOpenAccess) {
@@ -39,11 +55,6 @@ object SparkApplyHostedByMapToResult {
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
}
- }
- }
- p
- })(Encoders.bean(classOf[Publication]))
}
def main(args: Array[String]): Unit = {

View File

@@ -0,0 +1,258 @@
package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix.{
Scholix,
ScholixCollectedFrom,
ScholixEntityId,
ScholixIdentifier,
ScholixRelationship,
ScholixResource
}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
case class RelationInfo(
source: String,
target: String,
relclass: String,
id: String,
collectedfrom: Seq[RelKeyValue]
) {}
case class RelKeyValue(key: String, value: String) {}
object ScholexplorerUtils {
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
val mapper = new ObjectMapper()
case class RelationVocabulary(original: String, inverse: String) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def generateDatasourceOpenAIREURLS(id: String): String = {
if (id != null && id.length > 12)
s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
else
null
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
def generateScholixResourceFromResult(result: Result): ScholixResource = {
if (result.getInstance() == null || result.getInstance().size() == 0)
return null
if (result.getPid == null || result.getPid.isEmpty)
return null
val r = new ScholixResource
r.setDnetIdentifier(result.getId)
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
if (persistentIdentifiers.isEmpty)
return null
r.setIdentifier(persistentIdentifiers.asJava)
r.setObjectType(result.getResulttype.getClassid)
r.setObjectSubType(
result
.getInstance()
.asScala
.filter(i => i != null && i.getInstancetype != null)
.map(i => i.getInstancetype.getClassname)
.distinct
.head
)
if (result.getTitle != null && result.getTitle.asScala.nonEmpty) {
val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList
if (titles.nonEmpty)
r.setTitle(titles.head)
else
return null
}
if (result.getAuthor != null && !result.getAuthor.isEmpty) {
val authors: List[ScholixEntityId] =
result.getAuthor.asScala
.map(a => {
val entity = new ScholixEntityId()
entity.setName(a.getFullname)
if (a.getPid != null && a.getPid.size() > 0)
entity.setIdentifiers(
a.getPid.asScala
.map(sp => {
val id = new ScholixIdentifier()
id.setIdentifier(sp.getValue)
id.setSchema(sp.getQualifier.getClassid)
id
})
.take(3)
.toList
.asJava
)
entity
})
.toList
if (authors.nonEmpty)
r.setCreator(authors.asJava)
}
val dt: List[String] = result
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
r.setPublicationDate(dt.distinct.head)
r.setPublisher(
result
.getInstance()
.asScala
.map(i => i.getHostedby)
.filter(h => !"unknown".equalsIgnoreCase(h.getValue))
.map(h => {
val eid = new ScholixEntityId()
eid.setName(h.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(h.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(h.getKey))
eid.setIdentifiers(List(id).asJava)
eid
})
.distinct
.asJava
)
r.setCollectedFrom(
result.getCollectedfrom.asScala
.map(cf => {
val scf = new ScholixCollectedFrom()
scf.setProvisionMode("collected")
scf.setCompletionStatus("complete")
val eid = new ScholixEntityId()
eid.setName(cf.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(cf.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey))
eid.setIdentifiers(List(id).asJava)
scf.setProvider(eid)
scf
})
.asJava
)
r
}
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
val s: Scholix = new Scholix
s.setSource(source)
if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
s.setLinkprovider(
relation.collectedfrom
.map(cf => {
val eid = new ScholixEntityId()
eid.setName(cf.value)
val id = new ScholixIdentifier()
id.setIdentifier(cf.key)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
eid.setIdentifiers(List(id).asJava)
eid
})
.toList
.asJava
)
else {
val eid = new ScholixEntityId()
eid.setName("OpenAIRE")
val id = new ScholixIdentifier()
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
eid.setIdentifiers(List(id).asJava)
s.setLinkprovider(List(eid).asJava)
}
s.setIdentifier(relation.id)
val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setPublicationDate(source.getPublicationDate)
s.setPublisher(source.getPublisher)
val mockTarget = new ScholixResource
mockTarget.setDnetIdentifier(relation.target)
s.setTarget(mockTarget)
s
}
def updateTarget(s: Scholix, t: ScholixResource): String = {
s.setTarget(t)
val spublishers: Seq[ScholixEntityId] =
if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List()
val tpublishers: Seq[ScholixEntityId] =
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
s.setPublisher(mergedPublishers.asJava)
mapper.writeValueAsString(s)
}
}

View File

@@ -0,0 +1,141 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{
KeyValue,
OtherResearchProduct,
Publication,
Relation,
Result,
Software,
Dataset => OafDataset
}
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
generateBidirectionalRelations(sourcePath, targetPath, spark)
generateScholixResource(sourcePath, targetPath, spark)
generateScholix(targetPath, spark)
}
def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
val entityMap: Map[String, StructType] = Map(
"publication" -> Encoders.bean(classOf[Publication]).schema,
"dataset" -> Encoders.bean(classOf[OafDataset]).schema,
"software" -> Encoders.bean(classOf[Software]).schema,
"otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
)
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
val resDs = spark.emptyDataset[ScholixResource]
val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
println(s"adding ${item._1}")
res.union(
spark.read
.schema(item._2)
.json(s"$inputPath/${item._1}")
.as[Result]
.map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
.filter(s => s != null)
)
})
scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
}
def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
val relSchema = Encoders.bean(classOf[Relation]).schema
val relDF = spark.read
.schema(relSchema)
.json(s"$inputPath/relation")
.where(
"datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
"and relClass <> 'merges' and relClass <> 'isMergedIn'"
)
.select("source", "target", "collectedfrom", "relClass")
def invRel: String => String = { s =>
ScholexplorerUtils.invRel(s)
}
import org.apache.spark.sql.functions.udf
val inverseRelationUDF = udf(invRel)
val inverseRelation = relDF.select(
col("target").alias("source"),
col("source").alias("target"),
col("collectedfrom"),
inverseRelationUDF(col("relClass")).alias("relClass")
)
val bidRel = inverseRelation
.union(relDF)
.withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
.withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
.drop("collectedfrom")
.withColumnRenamed("cf", "collectedfrom")
.groupBy(col("id"))
.agg(
first("source").alias("source"),
first("target").alias("target"),
first("relClass").alias("relClass"),
first("collectedfrom").alias("collectedfrom")
)
bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
}
def generateScholix(outputPath: String, spark: SparkSession): Unit = {
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
import spark.implicits._
val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
val scholix_one_verse = relations
.joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
.map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
val resourceTarget = relations
.joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
.map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
scholix_one_verse
.joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
.map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(s"$outputPath/scholix")
}
}
object SparkCreateScholexplorerDump {
val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
def main(args: Array[String]): Unit = {
new SparkCreateScholexplorerDump(
log = logger,
args = args,
propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json"
).initialize().run()
}
}

View File

@@ -0,0 +1,26 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Test
import org.objenesis.strategy.StdInstantiatorStrategy
class ScholixGenerationTest {
@Test
def generateScholix(): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
val app = new SparkCreateScholexplorerDump(null, null, null)
// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
// app.generateBidirectionalRelations(
// "/home/sandro/Downloads/scholix_sample/",
// "/home/sandro/Downloads/scholix/",
// spark
// )
app.generateScholix("/home/sandro/Downloads/scholix/", spark)
}
}

View File

@@ -18,7 +18,7 @@
<executions>
<execution>
<id>scala-compile-first</id>
- <phase>initialize</phase>
+ <phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
@@ -59,12 +59,6 @@
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </exclusion>
- </exclusions>
</dependency>
<dependency>
<groupId>dom4j</groupId>
@@ -160,6 +154,26 @@
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
+ <exclusion>
+ <artifactId>ant</artifactId>
+ <groupId>org.apache.ant</groupId>
+ </exclusion>
+ <exclusion>
+ <artifactId>antlr4-runtime</artifactId>
+ <groupId>org.antlr</groupId>
+ </exclusion>
+ <exclusion>
+ <artifactId>woodstox-core</artifactId>
+ <groupId>com.fasterxml.woodstox</groupId>
+ </exclusion>
+ <exclusion>
+ <artifactId>log4j</artifactId>
+ <groupId>*</groupId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
@@ -206,5 +220,90 @@
</dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-3</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-4</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-4</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@@ -31,7 +31,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
- import eu.dnetlib.dhp.oa.provision.XmlConverterJob;
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
@@ -48,7 +47,7 @@ public class IrishOaiExporterJob {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
- XmlConverterJob.class
+ IrishOaiExporterJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json")));
parser.parseArgument(args);

View File

@@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
result
.getTitle()
.stream()
+ .filter(t -> StringUtils.isNotBlank(t.getValue()))
.findFirst()
- .map(StructuredProperty::getValue)
.ifPresent(
- title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
+ title -> {
+ re.setTitle(title);
+ re
+ .getTitle()
+ .setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
+ });
}
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
result

View File

@@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
- import static org.apache.spark.sql.functions.*;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.compress.GzipCodec;
- import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
- import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.MapFunction;
- import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
- import org.apache.spark.sql.expressions.UserDefinedFunction;
- import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -45,9 +37,9 @@ import scala.Tuple2;
/**
 * XmlConverterJob converts the JoinedEntities as XML records
 */
- public class XmlConverterJob {
+ public class PayloadConverterJob {
- private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
+ private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
@@ -56,8 +48,8 @@ public class XmlConverterJob {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
- XmlConverterJob.class
+ PayloadConverterJob.class
- .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
+ .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
@@ -72,6 +64,12 @@ public class XmlConverterJob {
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
+ final Boolean validateXML = Optional
+ .ofNullable(parser.get("validateXML"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.FALSE);
+ log.info("validateXML: {}", validateXML);
final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
@@ -86,18 +84,19 @@ public class XmlConverterJob {
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, outputPath);
- convertToXml(
+ createPayloads(
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
- VocabularyGroup.loadVocsFromIS(isLookup));
+ VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
});
}
- private static void convertToXml(
+ private static void createPayloads(
final SparkSession spark,
final String inputPath,
final String outputPath,
final ContextMapper contextMapper,
- final VocabularyGroup vocabularies) {
+ final VocabularyGroup vocabularies,
+ final Boolean validateXML) {
final XmlRecordFactory recordFactory = new XmlRecordFactory(
prepareAccumulators(spark.sparkContext()),
@@ -118,7 +117,7 @@ public class XmlConverterJob {
.as(Encoders.kryo(JoinedEntity.class))
.map(
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
- recordFactory.build(je),
+ recordFactory.build(je, validateXML),
ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
.map(

Some files were not shown because too many files have changed in this diff.