Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi

2020-07-28 08:07:40 +02:00 · 2020-07-28 08:07:40 +02:00 · a6acb37689
parent ca37d3427b ee832f358e
commit a6acb37689
237 changed files with 7117 additions and 1930 deletions
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@ -14,6 +14,37 @@
    <description>This module contains common schema classes meant to be used across the dnet-hadoop submodules</description>
    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.0.1</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>initialize</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
@ -1,8 +1,6 @@
 package eu.dnetlib.dhp.schema.common;
 import java.security.Key;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala
@ -0,0 +1,90 @@
 package eu.dnetlib.dhp.schema.scholexplorer
 import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty}
 /**
  * Factory helpers that instantiate OAF schema beans (KeyValue, DataInfo, Qualifier, Field,
  * StructuredProperty), populate them through their setters and return them.
  */
 object OafUtils {
  /**
    * Builds a KeyValue holding the given key and value, with a DataInfo created by
    * `generateDataInfo("0.9")`.
    */
  def generateKeyValue(key: String, value: String): KeyValue = {
    val kv: KeyValue = new KeyValue()
    kv.setKey(key)
    kv.setValue(value)
    kv.setDataInfo(generateDataInfo("0.9"))
    kv
  }
  /**
    * Builds a DataInfo that is neither inferred nor deleted by inference, with the
    * provenance action set to the `sysimport:actionset` / `dnet:provenanceActions` qualifier.
    *
    * @param trust      value stored as the trust of the DataInfo
    * @param invisibile value stored in the `invisible` flag (the parameter spelling is kept
    *                   so that callers using named arguments keep compiling)
    */
  def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    // Honour the caller's flag: it used to be ignored and hard-coded to false.
    di.setInvisible(invisibile)
    di.setTrust(trust)
    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
    di
  }
  /** Builds a Qualifier whose class id/name are both `cls` and whose scheme id/name are both `sch`. */
  def createQualifier(cls: String, sch: String): Qualifier = {
    createQualifier(cls, cls, sch, sch)
  }
  /** Builds a Qualifier from explicit class id, class name, scheme id and scheme name. */
  def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
    val q: Qualifier = new Qualifier
    q.setClassid(classId)
    q.setClassname(className)
    q.setSchemeid(schemeId)
    q.setSchemename(schemeName)
    q
  }
  /** Wraps `value` in a new Field. */
  def asField[T](value: T): Field[T] = {
    val tmp = new Field[T]
    tmp.setValue(value)
    tmp
  }
  /** Builds a StructuredProperty with the given value and a fully specified qualifier. */
  def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
    sp.setValue(value)
    sp
  }
  /** Same as the five-argument overload, additionally attaching `dataInfo`. */
  def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
    val sp = createSP(value, classId, className, schemeId, schemeName)
    sp.setDataInfo(dataInfo)
    sp
  }
  /** Builds a StructuredProperty whose qualifier reuses `classId` as class name and `schemeId` as scheme name. */
  def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
    // Equivalent to createQualifier(classId, schemeId), which expands to the same four values.
    createSP(value, classId, classId, schemeId, schemeId)
  }
  /** Same as the three-argument overload, additionally attaching `dataInfo`. */
  def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
    val sp = createSP(value, classId, schemeId)
    sp.setDataInfo(dataInfo)
    sp
  }
 }
--- a/dhp-workflows/dhp-broker-events/pom.xml
+++ b/dhp-workflows/dhp-broker-events/pom.xml
@ -1,5 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0"
 	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 	<parent>
 		<artifactId>dhp-workflows</artifactId>
 		<groupId>eu.dnetlib.dhp</groupId>
@ -24,6 +26,10 @@
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-sql_2.11</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.elasticsearch</groupId>
 			<artifactId>elasticsearch-hadoop</artifactId>
 		</dependency>
 		<dependency>
@ -51,9 +57,8 @@
 		</dependency>
 		<dependency>
-			<groupId>eu.dnetlib</groupId>
+			<groupId>eu.dnetlib.dhp</groupId>
 			<artifactId>dnet-openaire-broker-common</artifactId>
 			<version>[3.0.3,4.0.0)</version>
 		</dependency>
 	</dependencies>
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java
@ -11,6 +11,8 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.time.DateUtils;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
 public class EventFactory {
@ -32,7 +34,10 @@ public class EventFactory {
 		final MappedFields map = createMapFromResult(updateInfo);
 		final String eventId = calculateEventId(
-			updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString());
+			updateInfo.getTopicPath(), updateInfo.getTargetDs().getOpenaireId(), updateInfo
 				.getTarget()
 				.getOpenaireId(),
 			updateInfo.getHighlightValueAsString());
 		res.setEventId(eventId);
 		res.setProducerId(PRODUCER_ID);
@ -42,6 +47,7 @@ public class EventFactory {
 		res.setCreationDate(now);
 		res.setExpiryDate(calculateExpiryDate(now));
 		res.setInstantMessage(false);
 		return res;
 	}
@ -51,8 +57,11 @@ public class EventFactory {
 		final OaBrokerMainEntity source = updateInfo.getSource();
 		final OaBrokerMainEntity target = updateInfo.getTarget();
-		map.setTargetDatasourceId(target.getCollectedFromId());
+		final OaBrokerRelatedDatasource targetDs = updateInfo.getTargetDs();
-		map.setTargetDatasourceName(target.getCollectedFromName());
+
 		map.setTargetDatasourceId(targetDs.getOpenaireId());
 		map.setTargetDatasourceName(targetDs.getName());
 		map.setTargetDatasourceType(targetDs.getType());
 		map.setTargetResultId(target.getOpenaireId());
@ -71,18 +80,29 @@ public class EventFactory {
 		// PROVENANCE INFO
 		map.setTrust(updateInfo.getTrust());
 		map.setProvenanceDatasourceId(source.getCollectedFromId());
 		map.setProvenanceDatasourceName(source.getCollectedFromName());
 		map.setProvenanceResultId(source.getOpenaireId());
 		source
 			.getDatasources()
 			.stream()
 			.filter(ds -> ds.getRelType().equals(BrokerConstants.COLLECTED_FROM_REL))
 			.findFirst()
 			.ifPresent(ds -> {
 				map.setProvenanceDatasourceId(ds.getOpenaireId());
 				map.setProvenanceDatasourceName(ds.getName());
 				map.setProvenanceDatasourceType(ds.getType());
 			});
 		return map;
 	}
-	private static String calculateEventId(final String topic, final String publicationId, final String value) {
+	private static String calculateEventId(final String topic, final String dsId, final String publicationId,
 		final String value) {
 		return "event-"
-			+ DigestUtils.md5Hex(topic).substring(0, 6) + "-"
+			+ DigestUtils.md5Hex(topic).substring(0, 4) + "-"
-			+ DigestUtils.md5Hex(publicationId).substring(0, 8) + "-"
+			+ DigestUtils.md5Hex(dsId).substring(0, 4) + "-"
-			+ DigestUtils.md5Hex(value).substring(0, 8);
+			+ DigestUtils.md5Hex(publicationId).substring(0, 7) + "-"
 			+ DigestUtils.md5Hex(value).substring(0, 5);
 	}
 	private static long calculateExpiryDate(final long now) {
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java
@ -13,6 +13,7 @@ public class MappedFields implements Serializable {
 	private String targetDatasourceId;
 	private String targetDatasourceName;
 	private String targetDatasourceType;
 	private String targetResultId;
 	private String targetResultTitle;
 	private long targetDateofacceptance;
@ -21,6 +22,7 @@ public class MappedFields implements Serializable {
 	private float trust;
 	private String provenanceDatasourceId;
 	private String provenanceDatasourceName;
 	private String provenanceDatasourceType;
 	private String provenanceResultId;
 	public String getTargetDatasourceId() {
@ -39,6 +41,14 @@ public class MappedFields implements Serializable {
 		this.targetDatasourceName = targetDatasourceName;
 	}
 	public String getTargetDatasourceType() {
 		return targetDatasourceType;
 	}
 	public void setTargetDatasourceType(final String targetDatasourceType) {
 		this.targetDatasourceType = targetDatasourceType;
 	}
 	public String getTargetResultId() {
 		return targetResultId;
 	}
@ -103,6 +113,14 @@ public class MappedFields implements Serializable {
 		this.provenanceDatasourceName = provenanceDatasourceName;
 	}
 	public String getProvenanceDatasourceType() {
 		return provenanceDatasourceType;
 	}
 	public void setProvenanceDatasourceType(final String provenanceDatasourceType) {
 		this.provenanceDatasourceType = provenanceDatasourceType;
 	}
 	public String getProvenanceResultId() {
 		return provenanceResultId;
 	}
@ -111,4 +129,8 @@ public class MappedFields implements Serializable {
 		this.provenanceResultId = provenanceResultId;
 	}
 	public static long getSerialversionuid() {
 		return serialVersionUID;
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java
@ -0,0 +1,112 @@
 package eu.dnetlib.dhp.broker.oa;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.sql.expressions.Aggregator;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.model.Event;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import scala.Tuple2;
 /**
  * Spark job that reads the events stored under {@code <workingPath>/events}, counts how many
  * times each event id occurs and writes the (eventId, count) pairs whose count is greater
  * than one, as JSON, under {@code <workingPath>/counts}.
  */
 public class CheckDuplictedIdsJob {
 	private static final Logger log = LoggerFactory.getLogger(CheckDuplictedIdsJob.class);
 	/**
 	 * Entry point. Arguments are parsed according to the {@code common_params.json} descriptor;
 	 * only {@code workingPath} is read here.
 	 *
 	 * @param args command line arguments
 	 * @throws Exception on any failure while parsing the arguments or running the job
 	 */
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					CheckDuplictedIdsJob.class
 						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
 		parser.parseArgument(args);
 		final SparkConf conf = new SparkConf();
 		final String eventsPath = parser.get("workingPath") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final String countPath = parser.get("workingPath") + "/counts";
 		log.info("countPath: {}", countPath);
 		final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
 		// Passed to ClusterUtils.incrementAccumulator for every duplicated id that is written out.
 		final LongAccumulator total = spark.sparkContext().longAccumulator("invalid_event_id");
 		final TypedColumn<Tuple2<String, Long>, Tuple2<String, Long>> agg = new CountAggregator().toColumn();
 		ClusterUtils
 			.readPath(spark, eventsPath, Event.class)
 			// one (eventId, 1) pair per event
 			.map(e -> new Tuple2<>(e.getEventId(), 1L), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
 			.groupByKey(t -> t._1, Encoders.STRING())
 			.agg(agg)
 			// drop the grouping key, keep the aggregated (eventId, count) pair
 			.map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
 			.filter(t -> t._2 > 1)
 			.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.json(countPath);
 	}
 	/**
 	 * Serialises an event to its JSON representation. Not referenced by {@link #main}.
 	 */
 	private static String eventAsJsonString(final Event f) throws JsonProcessingException {
 		return new ObjectMapper().writeValueAsString(f);
 	}
 }
 /**
  * Aggregator over (key, count) pairs: the counts are summed and the first non-blank key
  * encountered is kept. Input, buffer and output all share the same tuple type.
  */
 class CountAggregator extends Aggregator<Tuple2<String, Long>, Tuple2<String, Long>, Tuple2<String, Long>> {
 	private static final long serialVersionUID = 1395935985734672538L;
 	/** Empty buffer: no key yet and a count of zero. */
 	@Override
 	public Tuple2<String, Long> zero() {
 		return new Tuple2<>(null, 0L);
 	}
 	/** Folds one input pair into the buffer. */
 	@Override
 	public Tuple2<String, Long> reduce(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
 		return combine(arg0, arg1);
 	}
 	/** Merges two partial buffers. */
 	@Override
 	public Tuple2<String, Long> merge(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
 		return combine(arg0, arg1);
 	}
 	/** The buffer is already the final result. */
 	@Override
 	public Tuple2<String, Long> finish(final Tuple2<String, Long> arg0) {
 		return arg0;
 	}
 	@Override
 	public Encoder<Tuple2<String, Long>> bufferEncoder() {
 		return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
 	}
 	@Override
 	public Encoder<Tuple2<String, Long>> outputEncoder() {
 		return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
 	}
 	/**
 	 * Shared body of {@link #reduce} and {@link #merge}: keeps the left key unless it is blank
 	 * (in which case the right key is used) and adds the two counts.
 	 */
 	private static Tuple2<String, Long> combine(final Tuple2<String, Long> left, final Tuple2<String, Long> right) {
 		final String key = StringUtils.defaultIfBlank(left._1, right._1);
 		return new Tuple2<>(key, left._2 + right._2);
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java
@ -3,28 +3,28 @@ package eu.dnetlib.dhp.broker.oa;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.SparkContext;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
+import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.model.Event;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.EventFinder;
 import eu.dnetlib.dhp.broker.oa.util.EventGroup;
 import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
 public class GenerateEventsJob {
@ -47,57 +47,54 @@ public class GenerateEventsJob {
 		final String workingPath = parser.get("workingPath");
 		log.info("workingPath: {}", workingPath);
 		final String isLookupUrl = parser.get("isLookupUrl");
 		log.info("isLookupUrl: {}", isLookupUrl);
 		final String dedupConfigProfileId = parser.get("dedupConfProfile");
 		log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
 		final String eventsPath = workingPath + "/events";
 		log.info("eventsPath: {}", eventsPath);
-		final SparkConf conf = new SparkConf();
+		final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
 		log.info("datasourceIdWhitelist: {}", StringUtils.join(dsIdWhitelist, ","));
-		// TODO UNCOMMENT
+		final Set<String> dsTypeWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceTypeWhitelist");
-		// final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
+		log.info("datasourceTypeWhitelist: {}", StringUtils.join(dsTypeWhitelist, ","));
-		final DedupConfig dedupConfig = null;
+
 		final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
 		log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			ClusterUtils.removeDir(spark, eventsPath);
 			final Map<String, LongAccumulator> accumulators = prepareAccumulators(spark.sparkContext());
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
 			final Dataset<ResultGroup> groups = ClusterUtils
 				.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
-			final Dataset<Event> events = groups
+			final Dataset<Event> dataset = groups
 				.map(
-					(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
+					g -> EventFinder
-					Encoders.bean(EventGroup.class))
+						.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
-				.flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class));
+					Encoders
 						.bean(EventGroup.class))
 				.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
-			events.write().mode(SaveMode.Overwrite).json(eventsPath);
+			ClusterUtils.save(dataset, eventsPath, Event.class, total);
 		});
 	}
-	private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
+	public static Map<String, LongAccumulator> prepareAccumulators(final SparkContext sc) {
-		final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
+		return EventFinder
 			.getMatchers()
 			.stream()
 			.map(UpdateMatcher::accumulatorName)
 			.distinct()
 			.collect(Collectors.toMap(s -> s, s -> sc.longAccumulator(s)));
 		final String conf = isLookUpService
 			.getResourceProfileByQuery(
 				String
 					.format(
 						"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
 						profId));
 		final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
 		dedupConfig.getPace().initModel();
 		dedupConfig.getPace().initTranslationMap();
 		// dedupConfig.getWf().setConfigurationId("???");
 		return dedupConfig;
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java
@ -0,0 +1,63 @@
 package eu.dnetlib.dhp.broker.oa;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.TypedColumn;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.model.Event;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.DatasourceStats;
 import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.StatsAggregator;
 /**
  * Spark job that reads the events stored under {@code <workingPath>/events}, groups them by
  * target datasource id, aggregates each group with {@link StatsAggregator} and saves the
  * resulting {@link DatasourceStats} under {@code <workingPath>/stats}.
  */
 public class GenerateStatsJob {
 	private static final Logger log = LoggerFactory.getLogger(GenerateStatsJob.class);
 	/**
 	 * Entry point. Arguments are parsed according to the {@code common_params.json} descriptor;
 	 * {@code isSparkSessionManaged} (defaults to true when absent) and {@code workingPath} are
 	 * read here.
 	 *
 	 * @param args command line arguments
 	 * @throws Exception on any failure while parsing the arguments or running the job
 	 */
 	public static void main(final String[] args) throws Exception {
 		// Load the descriptor through this job's own class rather than through IndexOnESJob.
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					GenerateStatsJob.class
 						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
 		parser.parseArgument(args);
 		final Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final SparkConf conf = new SparkConf();
 		final String eventsPath = parser.get("workingPath") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final String statsPath = parser.get("workingPath") + "/stats";
 		log.info("stats: {}", statsPath);
 		final TypedColumn<Event, DatasourceStats> aggr = new StatsAggregator().toColumn();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			final Dataset<DatasourceStats> stats = ClusterUtils
 				.readPath(spark, eventsPath, Event.class)
 				.groupByKey(e -> e.getMap().getTargetDatasourceId(), Encoders.STRING())
 				.agg(aggr)
 				// drop the grouping key, keep the aggregated stats
 				.map(t -> t._2, Encoders.bean(DatasourceStats.class));
 			// No accumulator is supplied for this save (null is passed).
 			ClusterUtils.save(stats, statsPath, DatasourceStats.class, null);
 		});
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
@ -0,0 +1,71 @@
 package eu.dnetlib.dhp.broker.oa;
 import java.util.HashMap;
 import java.util.Map;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.model.Event;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 /**
  * Spark job that reads the events stored under {@code <workingPath>/events}, serialises each
  * of them to JSON and writes them to an Elasticsearch index through {@link JavaEsSpark},
  * using the event id as document id.
  */
 public class IndexOnESJob {
 	private static final Logger log = LoggerFactory.getLogger(IndexOnESJob.class);
 	// Shared mapper: avoids instantiating a new ObjectMapper for every serialised event.
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	/**
 	 * Entry point. Arguments are parsed according to the {@code index_es.json} descriptor;
 	 * {@code workingPath}, {@code index} and {@code esHost} are read here.
 	 *
 	 * @param args command line arguments
 	 * @throws Exception on any failure while parsing the arguments or running the job
 	 */
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					IndexOnESJob.class
 						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_es.json")));
 		parser.parseArgument(args);
 		final SparkConf conf = new SparkConf();
 		final String eventsPath = parser.get("workingPath") + "/events";
 		log.info("eventsPath: {}", eventsPath);
 		final String index = parser.get("index");
 		log.info("index: {}", index);
 		final String indexHost = parser.get("esHost");
 		log.info("indexHost: {}", indexHost);
 		final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
 		final JavaRDD<String> inputRdd = ClusterUtils
 			.readPath(spark, eventsPath, Event.class)
 			// .limit(10000) // TODO REMOVE
 			.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
 			.javaRDD();
 		final Map<String, String> esCfg = new HashMap<>();
 		// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
 		esCfg.put("es.nodes", indexHost);
 		esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
 		esCfg.put("es.batch.write.retry.count", "8");
 		esCfg.put("es.batch.write.retry.wait", "60s");
 		esCfg.put("es.batch.size.entries", "200");
 		esCfg.put("es.nodes.wan.only", "true");
 		JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
 	}
 	/**
 	 * Serialises an event to its JSON representation.
 	 *
 	 * @param f the event to serialise
 	 * @return the JSON string produced by the shared mapper
 	 * @throws JsonProcessingException if the event cannot be serialised
 	 */
 	private static String eventAsJsonString(final Event f) throws JsonProcessingException {
 		return OBJECT_MAPPER.writeValueAsString(f);
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java
@ -0,0 +1,80 @@
 package eu.dnetlib.dhp.broker.oa;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasource;
 import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasourceAggregator;
 import scala.Tuple2;
 /**
  * Spark job that left-outer-joins the entities stored under {@code <workingPath>/simpleEntities}
  * with the relations stored under {@code <workingPath>/relatedDatasources}, aggregates each
  * entity's group with {@link RelatedDatasourceAggregator} and saves the result under
  * {@code <workingPath>/joinedEntities_step0}.
  */
 public class JoinStep0Job {
 	private static final Logger log = LoggerFactory.getLogger(JoinStep0Job.class);
 	/**
 	 * Entry point. Arguments are parsed according to the {@code common_params.json} descriptor;
 	 * {@code isSparkSessionManaged} (defaults to true when absent), {@code graphPath} (only
 	 * logged) and {@code workingPath} are read here.
 	 *
 	 * @param args command line arguments
 	 * @throws Exception on any failure while parsing the arguments or running the job
 	 */
 	public static void main(final String[] args) throws Exception {
 		final String paramsJson = IOUtils
 			.toString(
 				JoinStep0Job.class
 					.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(paramsJson);
 		parser.parseArgument(args);
 		final Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
 		final String workingPath = parser.get("workingPath");
 		log.info("workingPath: {}", workingPath);
 		final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
 		log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
 		runWithSparkSession(new SparkConf(), isSparkSessionManaged, spark -> {
 			// Clear the output of a previous run before producing the new one.
 			ClusterUtils.removeDir(spark, joinedEntitiesPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> entities = ClusterUtils
 				.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
 			final Dataset<RelatedDatasource> datasourceRels = ClusterUtils
 				.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
 				.toColumn();
 			// Left outer join keeps the entities that have no related datasource.
 			final Dataset<OaBrokerMainEntity> joined = entities
 				.joinWith(
 					datasourceRels, entities.col("openaireId").equalTo(datasourceRels.col("source")), "left_outer")
 				.groupByKey(pair -> pair._1.getOpenaireId(), Encoders.STRING())
 				.agg(aggr)
 				.map(keyed -> keyed._2, Encoders.bean(OaBrokerMainEntity.class));
 			ClusterUtils.save(joined, joinedEntitiesPath, OaBrokerMainEntity.class, total);
 		});
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -52,8 +52,10 @@ public class JoinStep1Job {
 			ClusterUtils.removeDir(spark, joinedEntitiesPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
-				.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
+				.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class);
 			final Dataset<RelatedProject> typedRels = ClusterUtils
 				.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
@ -61,16 +63,15 @@ public class JoinStep1Job {
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
 				.toColumn();
-			sources
+			final Dataset<OaBrokerMainEntity> dataset = sources
 				.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
 				.groupByKey(
 					(MapFunction<Tuple2<OaBrokerMainEntity, RelatedProject>, String>) t -> t._1.getOpenaireId(),
 					Encoders.STRING())
 				.agg(aggr)
-				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
+				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
 				.json(joinedEntitiesPath);
 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java
@ -7,11 +7,10 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -52,6 +51,8 @@ public class JoinStep2Job {
 			ClusterUtils.removeDir(spark, joinedEntitiesPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
 				.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
@ -61,16 +62,13 @@ public class JoinStep2Job {
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
 				.toColumn();
-			sources
+			final Dataset<OaBrokerMainEntity> dataset = sources
 				.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
-				.groupByKey(
+				.groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING())
 					(MapFunction<Tuple2<OaBrokerMainEntity, RelatedSoftware>, String>) t -> t._1.getOpenaireId(),
 					Encoders.STRING())
 				.agg(aggr)
-				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
+				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
 				.json(joinedEntitiesPath);
 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -52,6 +52,8 @@ public class JoinStep3Job {
 			ClusterUtils.removeDir(spark, joinedEntitiesPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
 				.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
@ -61,16 +63,15 @@ public class JoinStep3Job {
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
 				.toColumn();
-			sources
+			final Dataset<OaBrokerMainEntity> dataset = sources
 				.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
 				.groupByKey(
 					(MapFunction<Tuple2<OaBrokerMainEntity, RelatedDataset>, String>) t -> t._1.getOpenaireId(),
 					Encoders.STRING())
 				.agg(aggr)
-				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
+				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
 				.json(joinedEntitiesPath);
 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -52,6 +52,8 @@ public class JoinStep4Job {
 			ClusterUtils.removeDir(spark, joinedEntitiesPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> sources = ClusterUtils
 				.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
@ -61,16 +63,15 @@ public class JoinStep4Job {
 			final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
 				.toColumn();
-			sources
+			final Dataset<OaBrokerMainEntity> dataset = sources
 				.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
 				.groupByKey(
 					(MapFunction<Tuple2<OaBrokerMainEntity, RelatedPublication>, String>) t -> t._1.getOpenaireId(),
 					Encoders.STRING())
 				.agg(aggr)
-				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
+				.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
 				.json(joinedEntitiesPath);
 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -57,6 +57,8 @@ public class PrepareGroupsJob {
 			ClusterUtils.removeDir(spark, groupsPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
 			final Dataset<OaBrokerMainEntity> results = ClusterUtils
 				.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
@ -67,20 +69,16 @@ public class PrepareGroupsJob {
 			final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
 				.toColumn();
-			final Dataset<ResultGroup> groups = results
+			final Dataset<ResultGroup> dataset = results
 				.joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner")
 				.groupByKey(
 					(MapFunction<Tuple2<OaBrokerMainEntity, Relation>, String>) t -> t._2.getTarget(),
 					Encoders.STRING())
 				.agg(aggr)
-				.map(
+				.map(t -> t._2, Encoders.bean(ResultGroup.class))
 					(MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
 				.filter(rg -> rg.getData().size() > 1);
-			groups
+			ClusterUtils.save(dataset, groupsPath, ResultGroup.class, total);
 				.write()
 				.mode(SaveMode.Overwrite)
 				.json(groupsPath);
 		});
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
+import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -54,6 +54,8 @@ public class PrepareRelatedDatasetsJob {
 			ClusterUtils.removeDir(spark, relsPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
 			final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
 				.readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
 				.filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
@ -67,16 +69,15 @@ public class PrepareRelatedDatasetsJob {
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
-			rels
+			final Dataset<RelatedDataset> dataset = rels
 				.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
 				.map(t -> {
 					final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
 					rel.getRelDataset().setRelType(t._1.getRelClass());
 					return rel;
-				}, Encoders.bean(RelatedDataset.class))
+				}, Encoders.bean(RelatedDataset.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, relsPath, RelatedDataset.class, total);
 				.json(relsPath);
 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java
@ -0,0 +1,107 @@
 package eu.dnetlib.dhp.broker.oa;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
 import eu.dnetlib.dhp.broker.oa.util.DatasourceRelationsAccumulator;
 import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasource;
 import eu.dnetlib.dhp.schema.oaf.Datasource;
 import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.Software;
 import scala.Tuple3;
 /**
  * Spark job that links results to their datasources and stores the outcome as
  * {@link RelatedDatasource} records under <code>workingPath + "/relatedDatasources"</code>.
  *
  * Inputs are the result tables (publication, dataset, software, otherresearchproduct) and the
  * datasource table found under <code>graphPath</code>.
  */
 public class PrepareRelatedDatasourcesJob {
 	private static final Logger log = LoggerFactory.getLogger(PrepareRelatedDatasourcesJob.class);
 	/**
 	 * Job entry point.
 	 *
 	 * Reads the argument definitions from the classpath resource
 	 * <code>/eu/dnetlib/dhp/broker/oa/common_params.json</code>, then uses the
 	 * <code>isSparkSessionManaged</code> (optional, defaults to TRUE), <code>graphPath</code> and
 	 * <code>workingPath</code> arguments.
 	 *
 	 * @param args command line arguments handed to {@link ArgumentApplicationParser}
 	 * @throws Exception propagated from argument parsing or from the Spark execution
 	 */
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					PrepareRelatedDatasourcesJob.class
 						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
 		parser.parseArgument(args);
 		// A missing "isSparkSessionManaged" argument is treated as TRUE.
 		final Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String graphPath = parser.get("graphPath");
 		log.info("graphPath: {}", graphPath);
 		final String workingPath = parser.get("workingPath");
 		log.info("workingPath: {}", workingPath);
 		// Output location of this job.
 		final String relsPath = workingPath + "/relatedDatasources";
 		log.info("relsPath: {}", relsPath);
 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			// Clear the output location before writing (presumably removes results of a previous run).
 			ClusterUtils.removeDir(spark, relsPath);
 			// Handed to ClusterUtils.save below; presumably counts the saved records - TODO confirm.
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_datasources");
 			// Tuples extracted from the four result types, merged into a single dataset.
 			final Dataset<Tuple3<String, String, String>> rels = prepareResultTuples(
 				spark, graphPath, Publication.class)
 					.union(prepareResultTuples(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class))
 					.union(prepareResultTuples(spark, graphPath, Software.class))
 					.union(prepareResultTuples(spark, graphPath, OtherResearchProduct.class));
 			final Dataset<OaBrokerRelatedDatasource> datasources = ClusterUtils
 				.readPath(spark, graphPath + "/datasource", Datasource.class)
 				.map(ConversionUtils::oafDatasourceToBrokerDatasource, Encoders.bean(OaBrokerRelatedDatasource.class));
 			// Inner join: the second tuple element must match the datasource openaireId.
 			final Dataset<RelatedDatasource> dataset = rels
 				.joinWith(datasources, datasources.col("openaireId").equalTo(rels.col("_2")), "inner")
 				.map(t -> {
 					// Tuple element 1 becomes the source, element 3 the relation type of the datasource.
 					final RelatedDatasource r = new RelatedDatasource();
 					r.setSource(t._1._1());
 					r.setRelDatasource(t._2);
 					r.getRelDatasource().setRelType(t._1._3());
 					return r;
 				}, Encoders.bean(RelatedDatasource.class));
 			ClusterUtils.save(dataset, relsPath, RelatedDatasource.class, total);
 		});
 	}
 	/**
 	 * Reads one result table and turns it into string triples produced by
 	 * {@link DatasourceRelationsAccumulator#calculateTuples}.
 	 *
 	 * The table is read from <code>graphPath + "/" + lower-cased simple class name</code>.
 	 * In the caller, element 1 of each triple is used as the source id, element 2 is joined with the
 	 * datasource openaireId and element 3 is used as the relation type.
 	 *
 	 * @param spark the active Spark session
 	 * @param graphPath base path of the graph tables
 	 * @param sourceClass the result type to read
 	 * @return the triples computed for every retained record
 	 */
 	private static final Dataset<Tuple3<String, String, String>> prepareResultTuples(final SparkSession spark,
 		final String graphPath,
 		final Class<? extends Result> sourceClass) {
 		return ClusterUtils
 			.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
 			// Drop the records identified as dedup roots.
 			.filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
 			// NOTE(review): keeps ONLY records flagged deleted-by-inference; confirm this polarity is intended.
 			.filter(r -> r.getDataInfo().getDeletedbyinference())
 			.map(
 				r -> DatasourceRelationsAccumulator.calculateTuples(r),
 				Encoders.bean(DatasourceRelationsAccumulator.class))
 			// One accumulator may carry several triples: flatten them.
 			.flatMap(
 				acc -> acc.getRels().iterator(),
 				Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING()));
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
+import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -56,6 +56,8 @@ public class PrepareRelatedProjectsJob {
 			ClusterUtils.removeDir(spark, relsPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
 			final Dataset<OaBrokerProject> projects = ClusterUtils
 				.readPath(spark, graphPath + "/project", Project.class)
 				.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
@ -69,12 +71,12 @@ public class PrepareRelatedProjectsJob {
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
-			rels
+			final Dataset<RelatedProject> dataset = rels
 				.joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner")
-				.map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class))
+				.map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, relsPath, RelatedProject.class, total);
-				.json(relsPath);
+
 		});
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
+import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -55,6 +55,8 @@ public class PrepareRelatedPublicationsJob {
 			ClusterUtils.removeDir(spark, relsPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
 			final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
 				.readPath(spark, graphPath + "/publication", Publication.class)
 				.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
@ -70,16 +72,15 @@ public class PrepareRelatedPublicationsJob {
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
-			rels
+			final Dataset<RelatedPublication> dataset = rels
 				.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
 				.map(t -> {
 					final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
 					rel.getRelPublication().setRelType(t._1.getRelClass());
 					return rel;
-				}, Encoders.bean(RelatedPublication.class))
+				}, Encoders.bean(RelatedPublication.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, relsPath, RelatedPublication.class, total);
 				.json(relsPath);
 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
+import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -56,6 +56,8 @@ public class PrepareRelatedSoftwaresJob {
 			ClusterUtils.removeDir(spark, relsPath);
 			final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
 			final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
 				.readPath(spark, graphPath + "/software", Software.class)
 				.filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
@ -69,12 +71,11 @@ public class PrepareRelatedSoftwaresJob {
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
 				.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
-			rels
+			final Dataset<RelatedSoftware> dataset = rels
 				.joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
-				.map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class))
+				.map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, relsPath, RelatedSoftware.class, total);
 				.json(relsPath);
 		});
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java
@ -9,8 +9,8 @@ import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -56,13 +56,14 @@ public class PrepareSimpleEntititiesJob {
 			ClusterUtils.removeDir(spark, simpleEntitiesPath);
-			prepareSimpleEntities(spark, graphPath, Publication.class)
+			final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
 			final Dataset<OaBrokerMainEntity> dataset = prepareSimpleEntities(spark, graphPath, Publication.class)
 				.union(prepareSimpleEntities(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class))
 				.union(prepareSimpleEntities(spark, graphPath, Software.class))
-				.union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class))
+				.union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class));
-				.write()
+
-				.mode(SaveMode.Overwrite)
+			ClusterUtils.save(dataset, simpleEntitiesPath, OaBrokerMainEntity.class, total);
 				.json(simpleEntitiesPath);
 		});
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java
@ -12,11 +12,12 @@ import java.util.stream.Collectors;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.util.LongAccumulator;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
 import eu.dnetlib.pace.config.DedupConfig;
 public abstract class UpdateMatcher<T> {
@ -34,20 +35,21 @@ public abstract class UpdateMatcher<T> {
 		this.highlightToStringFunction = highlightToStringFunction;
 	}
-	public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity res,
+	public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity target,
 		final OaBrokerRelatedDatasource targetDs,
 		final Collection<OaBrokerMainEntity> others,
-		final DedupConfig dedupConfig) {
+		final Map<String, LongAccumulator> accumulators) {
 		final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
 		for (final OaBrokerMainEntity source : others) {
-			if (source != res) {
+			if (source != target) {
-				for (final T hl : findDifferences(source, res)) {
+				for (final T hl : findDifferences(source, target)) {
 					final Topic topic = getTopicFunction().apply(hl);
 					if (topic != null) {
-						final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res,
+						final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, target, targetDs,
 							getCompileHighlightFunction(),
-							getHighlightToStringFunction(), dedupConfig);
+							getHighlightToStringFunction());
 						final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
 						if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
@ -67,9 +69,10 @@ public abstract class UpdateMatcher<T> {
 		if (values.isEmpty()) {
 			return new ArrayList<>();
 		} else if (values.size() > maxNumber) {
-			System.err.println("Too many events (" + values.size() + ") matched by " + getClass().getSimpleName());
+			incrementAccumulator(accumulators, maxNumber);
 			return values.subList(0, maxNumber);
 		} else {
 			incrementAccumulator(accumulators, values.size());
 			return values;
 		}
 	}
@ -80,8 +83,8 @@ public abstract class UpdateMatcher<T> {
 		return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
 	}
-	protected boolean isMissing(final String field) {
+	protected boolean isMissing(final String s) {
-		return StringUtils.isBlank(field);
+		return StringUtils.isBlank(s);
 	}
 	public int getMaxNumber() {
@ -100,4 +103,14 @@ public abstract class UpdateMatcher<T> {
 		return highlightToStringFunction;
 	}
 	/**
 	 * Name under which this matcher looks up its accumulator: the "event_matcher_" prefix followed by
 	 * the lower-cased simple name of the runtime class.
 	 *
 	 * @return the accumulator name of this matcher
 	 */
 	public String accumulatorName() {
 		final String matcherName = getClass().getSimpleName().toLowerCase();
 		return "event_matcher_" + matcherName;
 	}
 	/**
 	 * Adds <code>n</code> to the accumulator registered under {@link #accumulatorName()}.
 	 * Nothing happens when the map is null or holds no entry for that name.
 	 *
 	 * @param accumulators accumulators indexed by name, may be null
 	 * @param n the amount to add
 	 */
 	public void incrementAccumulator(final Map<String, LongAccumulator> accumulators, final long n) {
 		if (accumulators == null) {
 			return;
 		}
 		final String name = accumulatorName();
 		if (accumulators.containsKey(name)) {
 			accumulators.get(name).add(n);
 		}
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBrokerRelatedDataset> {
@ -25,6 +27,10 @@ public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBroke
 	protected final List<OaBrokerRelatedDataset> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getDatasets().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingDatasets = target
 			.getDatasets()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java
@ -15,7 +15,7 @@ public class EnrichMissingProject extends UpdateMatcher<OaBrokerProject> {
 		super(20,
 			prj -> Topic.ENRICH_MISSING_PROJECT,
 			(p, prj) -> p.getProjects().add(prj),
-			prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
+			prj -> prj.getOpenaireId());
 	}
 	@Override
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerProject;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
@ -16,27 +18,27 @@ public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
 		super(20,
 			prj -> Topic.ENRICH_MORE_PROJECT,
 			(p, prj) -> p.getProjects().add(prj),
-			prj -> projectAsString(prj));
+			prj -> prj.getOpenaireId());
 	}
 	/**
 	 * Renders a project as "funder::fundingProgram::code".
 	 *
 	 * @param prj the project to render
 	 * @return the three values joined by "::"
 	 */
 	private static String projectAsString(final OaBrokerProject prj) {
 		final StringBuilder sb = new StringBuilder();
 		sb.append(prj.getFunder());
 		sb.append("::");
 		sb.append(prj.getFundingProgram());
 		sb.append("::");
 		sb.append(prj.getCode());
 		return sb.toString();
 	}
 	@Override
 	protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingProjects = target
 			.getProjects()
 			.stream()
-			.map(EnrichMoreProject::projectAsString)
+			.map(p -> p.getOpenaireId())
 			.collect(Collectors.toSet());
 		return source
 			.getProjects()
 			.stream()
-			.filter(p -> !existingProjects.contains(projectAsString(p)))
+			.filter(p -> !existingProjects.contains(p.getOpenaireId()))
 			.collect(Collectors.toList());
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaBrokerRelatedPublication> {
@ -27,6 +29,10 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaB
 		final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getPublications().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingPublications = target
 			.getPublications()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
@ -24,6 +26,10 @@ public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
 		final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getSoftwares().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingSoftwares = source
 			.getSoftwares()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -11,6 +12,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
@ -25,6 +27,10 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
 	protected List<OaBrokerAuthor> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getCreators().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingOrcids = target
 			.getCreators()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
@ -23,6 +24,11 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<OaBrokerInstance> {
 	@Override
 	protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getInstances().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final long count = target
 			.getInstances()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java
@ -22,9 +22,8 @@ public class EnrichMissingPid extends UpdateMatcher<OaBrokerTypedValue> {
 	@Override
 	protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		final long count = target.getPids().size();
-		if (count > 0) {
+		if (target.getPids().size() > 0) {
 			return Arrays.asList();
 		}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerTypedValue;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
@ -22,6 +24,11 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
 	@Override
 	protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getSubjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingSubject = target
 			.getSubjects()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -23,6 +24,11 @@ public class EnrichMoreOpenAccess extends UpdateMatcher<OaBrokerInstance> {
 	@Override
 	protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getInstances().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> urls = target
 			.getInstances()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerTypedValue;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
@ -22,6 +24,11 @@ public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
 	@Override
 	protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getPids().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingPids = target
 			.getPids()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerTypedValue;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
@ -23,6 +25,10 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
 	protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target) {
 		if (target.getSubjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
 			return new ArrayList<>();
 		}
 		final Set<String> existingSubjects = target
 			.getSubjects()
 			.stream()
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java
@ -14,11 +14,19 @@ public class BrokerConstants {
 	public static final String OPEN_ACCESS = "OPEN";
 	public static final String IS_MERGED_IN_CLASS = "isMergedIn";
 	// Relation label attached by DatasourceRelationsAccumulator to datasource ids taken from Result.getCollectedfrom()
 	public static final String COLLECTED_FROM_REL = "collectedFrom";
 	// Relation label attached by DatasourceRelationsAccumulator to datasource ids taken from Instance.getHostedby()
 	public static final String HOSTED_BY_REL = "hostedBy";
 	// Fallback trust returned by TrustUtils.calculateTrust when no dedup configuration is loaded or scoring fails
 	public static final float MIN_TRUST = 0.25f;
 	// Upper bound of the trust value: TrustUtils.rescale returns it for any score >= MAX_TRUST
 	public static final float MAX_TRUST = 1.00f;
 	// Maximum number of related datasources kept per entity by RelatedDatasourceAggregator
 	public static final int MAX_NUMBER_OF_RELS = 20;
 	// Maximum length of a single field value: ConversionUtils abbreviates longer strings to this size
 	public static final int MAX_STRING_SIZE = 3000;
 	// Maximum number of elements kept in converted lists; the EnrichMore* matchers emit nothing once the target reaches it
 	public static final int MAX_LIST_SIZE = 50;
 	public static Class<?>[] getModelClasses() {
 		final Set<Class<?>> list = new HashSet<>();
 		list.addAll(Arrays.asList(ModelSupport.getOafModelClasses()));
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java
@ -1,13 +1,21 @@
 package eu.dnetlib.dhp.broker.oa.util;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.util.LongAccumulator;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 public class ClusterUtils {
@ -44,4 +52,38 @@ public class ClusterUtils {
 			|| s.equals("isSupplementedTo");
 	}
 	/**
 	 * Pass-through helper meant to be used inside a map step: adds 1 to the accumulator and hands the element back.
 	 *
 	 * @param o   the element being processed, returned as is
 	 * @param acc the accumulator to increment; ignored when null
 	 * @return the same element received as input
 	 */
 	public static <T> T incrementAccumulator(final T o, final LongAccumulator acc) {
 		// no accumulator configured: nothing to count
 		if (acc == null) {
 			return o;
 		}
 		acc.add(1);
 		return o;
 	}
 	/**
 	 * Writes the dataset as JSON under the given path, replacing any existing output (SaveMode.Overwrite).
 	 * Each record goes through incrementAccumulator before being written, so the accumulator (when not null)
 	 * is incremented once per record handled by the map step.
 	 *
 	 * @param dataset the dataset to store
 	 * @param path    the output path
 	 * @param clazz   the bean class used to build the encoder of the mapped dataset
 	 * @param acc     the accumulator to increment for each record; may be null
 	 */
 	public static <T> void save(final Dataset<T> dataset,
 		final String path,
 		final Class<T> clazz,
 		final LongAccumulator acc) {
 		dataset
 			.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.json(path);
 	}
 	/**
 	 * Reads a comma separated parameter and returns its distinct, trimmed, non blank items.
 	 *
 	 * A value made of at most one character (for example '-') stands for an empty list.
 	 * A missing (null) value is handled as an empty list too instead of raising a NullPointerException.
 	 *
 	 * @param parser the parser holding the application arguments
 	 * @param key    the name of the parameter to read
 	 * @return the set of items, never null
 	 */
 	public static Set<String> parseParamAsList(final ArgumentApplicationParser parser, final String key) {
 		// trimToEmpty behaves like trim() for non null values and maps null to ""
 		final String s = StringUtils.trimToEmpty(parser.get(key));
 		final Set<String> res = new HashSet<>();
 		if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list
 			Arrays
 				.stream(s.split(","))
 				.map(String::trim)
 				.filter(StringUtils::isNotBlank)
 				.forEach(res::add);
 		}
 		return res;
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
@ -22,11 +22,13 @@ import eu.dnetlib.broker.objects.OaBrokerJournal;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerProject;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
 import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
 import eu.dnetlib.broker.objects.OaBrokerTypedValue;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
 import eu.dnetlib.dhp.schema.oaf.Datasource;
 import eu.dnetlib.dhp.schema.oaf.ExternalReference;
 import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.Instance;
@ -119,11 +121,10 @@ public class ConversionUtils {
 		res
 			.setJournal(
 				result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
 		res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
 		res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
 		res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
 		res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
-		res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef));
+		res
 			.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef));
 		return res;
 	}
@ -141,6 +142,7 @@ public class ConversionUtils {
 			.filter(pid -> pid.getQualifier().getClassid() != null)
 			.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
 			.map(pid -> pid.getValue())
 			.map(pid -> cleanOrcid(pid))
 			.filter(StringUtils::isNotBlank)
 			.findFirst()
 			.orElse(null) : null;
@ -148,6 +150,11 @@ public class ConversionUtils {
 		return new OaBrokerAuthor(author.getFullname(), pids);
 	}
 	/**
 	 * Strips the orcid.org URL prefix from an ORCID identifier.
 	 *
 	 * @param s the raw pid value, possibly a full URL such as https://orcid.org/0000-0002-1825-0097; may be null
 	 * @return the text following the first occurrence of "//orcid.org/", the input itself when the marker is
 	 *         absent, or null when the input is null (the caller filters blank values afterwards)
 	 */
 	private static String cleanOrcid(final String s) {
 		// a pid without value must not break the whole conversion
 		if (s == null) {
 			return null;
 		}
 		final String match = "//orcid.org/";
 		final int pos = s.indexOf(match);
 		return pos >= 0 ? s.substring(pos + match.length()) : s;
 	}
 	private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) {
 		if (journal == null) {
 			return null;
@ -216,6 +223,18 @@ public class ConversionUtils {
 		return res;
 	}
 	/**
 	 * Converts an OAF datasource into its broker representation.
 	 *
 	 * The name is the official name, or the english name when the official one is blank.
 	 *
 	 * @param ds the datasource to convert
 	 * @return the broker datasource, or null when the input is null
 	 */
 	public static final OaBrokerRelatedDatasource oafDatasourceToBrokerDatasource(final Datasource ds) {
 		if (ds != null) {
 			final String officialName = fieldValue(ds.getOfficialname());
 			final String englishName = fieldValue(ds.getEnglishname());
 			final OaBrokerRelatedDatasource brokerDs = new OaBrokerRelatedDatasource();
 			brokerDs.setName(StringUtils.defaultIfBlank(officialName, englishName));
 			brokerDs.setOpenaireId(ds.getId());
 			brokerDs.setType(classId(ds.getDatasourcetype()));
 			return brokerDs;
 		}
 		return null;
 	}
 	/**
 	 * Returns the first element of the list, or null when the list is null or empty.
 	 */
 	private static String first(final List<String> list) {
 		if (list == null || list.isEmpty()) {
 			return null;
 		}
 		return list.get(0);
 	}
@ -245,7 +264,13 @@ public class ConversionUtils {
 	private static List<String> fieldList(final List<Field<String>> fl) {
 		return fl != null
-			? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList())
+			? fl
 				.stream()
 				.map(Field::getValue)
 				.map(s -> StringUtils.abbreviate(s, BrokerConstants.MAX_STRING_SIZE))
 				.filter(StringUtils::isNotBlank)
 				.limit(BrokerConstants.MAX_LIST_SIZE)
 				.collect(Collectors.toList())
 			: new ArrayList<>();
 	}
@ -255,6 +280,7 @@ public class ConversionUtils {
 				.stream()
 				.map(StructuredProperty::getValue)
 				.filter(StringUtils::isNotBlank)
 				.limit(BrokerConstants.MAX_LIST_SIZE)
 				.collect(Collectors.toList())
 			: new ArrayList<>();
 	}
@ -280,6 +306,7 @@ public class ConversionUtils {
 			.stream()
 			.map(func::apply)
 			.filter(Objects::nonNull)
 			.limit(BrokerConstants.MAX_LIST_SIZE)
 			.collect(Collectors.toList());
 	}
@ -293,6 +320,7 @@ public class ConversionUtils {
 			.map(func::apply)
 			.flatMap(List::stream)
 			.filter(Objects::nonNull)
 			.limit(BrokerConstants.MAX_LIST_SIZE)
 			.collect(Collectors.toList());
 	}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java
@ -0,0 +1,68 @@
 package eu.dnetlib.dhp.broker.oa.util;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import scala.Tuple3;
 /**
  * Holder of the (result id, datasource id, relation type) tuples that link a result to its datasources.
  */
 public class DatasourceRelationsAccumulator implements Serializable {
 	private static final long serialVersionUID = 3256220670651218957L;
 	private List<Tuple3<String, String, String>> rels = new ArrayList<>();
 	public List<Tuple3<String, String, String>> getRels() {
 		return rels;
 	}
 	public void setRels(final List<Tuple3<String, String, String>> rels) {
 		this.rels = rels;
 	}
 	protected void addTuple(final Tuple3<String, String, String> t) {
 		rels.add(t);
 	}
 	/**
 	 * Builds the datasource relations of a result.
 	 *
 	 * A tuple with BrokerConstants.COLLECTED_FROM_REL is produced for each distinct non blank key found in
 	 * the collectedfrom list. A tuple with BrokerConstants.HOSTED_BY_REL is produced for each distinct non
 	 * blank hostedby key of the instances, skipping the "Unknown Repository" entries and the ids already
 	 * present among the collectedfrom ones. Null lists and null elements are ignored.
 	 *
 	 * @param r the result to inspect
 	 * @return a new accumulator with the collectedFrom tuples followed by the hostedBy ones
 	 */
 	public static final DatasourceRelationsAccumulator calculateTuples(final Result r) {
 		// collecting into a Set already removes the duplicates, no distinct() is needed
 		final Set<String> collectedFromSet = nullToEmpty(r.getCollectedfrom())
 			.stream()
 			.filter(Objects::nonNull)
 			.map(kv -> kv.getKey())
 			.filter(StringUtils::isNotBlank)
 			.collect(Collectors.toSet());
 		final Set<String> hostedBySet = nullToEmpty(r.getInstance())
 			.stream()
 			.filter(Objects::nonNull)
 			.map(i -> i.getHostedby())
 			.filter(Objects::nonNull)
 			.filter(kv -> !StringUtils.equalsIgnoreCase(kv.getValue(), "Unknown Repository"))
 			.map(kv -> kv.getKey())
 			.filter(StringUtils::isNotBlank)
 			.filter(id -> !collectedFromSet.contains(id))
 			.collect(Collectors.toSet());
 		final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
 		collectedFromSet
 			.stream()
 			.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
 			.forEach(res::addTuple);
 		hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
 		return res;
 	}
 	/**
 	 * Returns the given list, or a new empty list when it is null.
 	 */
 	private static <X> List<X> nullToEmpty(final List<X> list) {
 		return list != null ? list : new ArrayList<>();
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java
@ -3,11 +3,31 @@ package eu.dnetlib.dhp.broker.oa.util;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 import eu.dnetlib.dhp.broker.model.EventFactory;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
 import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
 import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
 import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
 import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
@ -18,11 +38,12 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
 import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
 import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
 import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
 import eu.dnetlib.pace.config.DedupConfig;
 public class EventFinder {
-	private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
+	private static final Logger log = LoggerFactory.getLogger(EventFinder.class);
 	private static final List<UpdateMatcher<?>> matchers = new ArrayList<>();
 	static {
 		matchers.add(new EnrichMissingAbstract());
 		matchers.add(new EnrichMissingAuthorOrcid());
@ -34,40 +55,66 @@ public class EventFinder {
 		matchers.add(new EnrichMorePid());
 		matchers.add(new EnrichMoreSubject());
-		// // Advanced matchers
+		// Advanced matchers
 		matchers.add(new EnrichMissingProject());
-		// matchers.add(new EnrichMoreProject());
+		matchers.add(new EnrichMoreProject());
-		// matchers.add(new EnrichMissingSoftware());
+		matchers.add(new EnrichMissingSoftware());
-		// matchers.add(new EnrichMoreSoftware());
+		matchers.add(new EnrichMoreSoftware());
-		// matchers.add(new EnrichMissingPublicationIsRelatedTo());
+		matchers.add(new EnrichMissingPublicationIsRelatedTo());
-		// matchers.add(new EnrichMissingPublicationIsReferencedBy());
+		matchers.add(new EnrichMissingPublicationIsReferencedBy());
-		// matchers.add(new EnrichMissingPublicationReferences());
+		matchers.add(new EnrichMissingPublicationReferences());
-		// matchers.add(new EnrichMissingPublicationIsSupplementedTo());
+		matchers.add(new EnrichMissingPublicationIsSupplementedTo());
-		// matchers.add(new EnrichMissingPublicationIsSupplementedBy());
+		matchers.add(new EnrichMissingPublicationIsSupplementedBy());
-		// matchers.add(new EnrichMissingDatasetIsRelatedTo());
+		matchers.add(new EnrichMissingDatasetIsRelatedTo());
-		// matchers.add(new EnrichMissingDatasetIsReferencedBy());
+		matchers.add(new EnrichMissingDatasetIsReferencedBy());
-		// matchers.add(new EnrichMissingDatasetReferences());
+		matchers.add(new EnrichMissingDatasetReferences());
-		// matchers.add(new EnrichMissingDatasetIsSupplementedTo());
+		matchers.add(new EnrichMissingDatasetIsSupplementedTo());
-		// matchers.add(new EnrichMissingDatasetIsSupplementedBy());
+		matchers.add(new EnrichMissingDatasetIsSupplementedBy());
 		// matchers.add(new EnrichMissingAbstract());
 	}
-	public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) {
+	public static EventGroup generateEvents(final ResultGroup results,
 		final Set<String> dsIdWhitelist,
 		final Set<String> dsIdBlacklist,
 		final Set<String> dsTypeWhitelist,
 		final Map<String, LongAccumulator> accumulators) {
 		final List<UpdateInfo<?>> list = new ArrayList<>();
 		for (final OaBrokerMainEntity target : results.getData()) {
 			for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
 				if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
 					for (final UpdateMatcher<?> matcher : matchers) {
-				list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig));
+						list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
 					}
 				}
 			}
 		}
 		return asEventGroup(list);
 	}
 	/**
 	 * Decides whether events must be generated for the given target datasource.
 	 *
 	 * The id whitelist wins over the id blacklist; a datasource listed in neither is accepted only when its
 	 * type belongs to the type whitelist.
 	 *
 	 * @param target          the datasource to check
 	 * @param dsIdWhitelist   ids of the datasources that are always accepted
 	 * @param dsIdBlacklist   ids of the datasources that are rejected (unless whitelisted)
 	 * @param dsTypeWhitelist accepted datasource types
 	 * @return true when the datasource is accepted
 	 */
 	private static boolean verifyTarget(final OaBrokerRelatedDatasource target,
 		final Set<String> dsIdWhitelist,
 		final Set<String> dsIdBlacklist,
 		final Set<String> dsTypeWhitelist) {
 		final String dsId = target.getOpenaireId();
 		if (dsIdWhitelist.contains(dsId)) {
 			return true;
 		}
 		if (dsIdBlacklist.contains(dsId)) {
 			return false;
 		}
 		return dsTypeWhitelist.contains(target.getType());
 	}
 	/**
 	 * Converts each update info into a broker event and collects the events, in list order, into a new group.
 	 */
 	private static EventGroup asEventGroup(final List<UpdateInfo<?>> list) {
 		final EventGroup group = new EventGroup();
 		for (final UpdateInfo<?> info : list) {
 			group.addElement(EventFactory.newBrokerEvent(info));
 		}
 		return group;
 	}
 	/**
 	 * Returns the matchers registered by the static initializer of this class.
 	 * The internal list itself is returned, not a copy.
 	 */
 	public static List<UpdateMatcher<?>> getMatchers() {
 		return matchers;
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java
@ -1,8 +1,62 @@
 package eu.dnetlib.dhp.broker.oa.util;
 import java.io.IOException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.tree.support.TreeProcessor;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 public class TrustUtils {
 	private static final Logger log = LoggerFactory.getLogger(TrustUtils.class);
 	private static DedupConfig dedupConfig;
 	// Loads once the dedup configuration bundled with the module; when the loading fails with an I/O error
 	// the field stays null and calculateTrust falls back to BrokerConstants.MIN_TRUST.
 	static {
 		final ObjectMapper mapper = new ObjectMapper();
 		try {
 			dedupConfig = mapper
 				.readValue(
 					DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
 					DedupConfig.class);
 		} catch (final IOException e) {
 			// the exception is a separate argument, so that the cause and its stack trace end up in the log
 			log.error("Error loading dedupConfig", e);
 		}
 	}
 	/**
 	 * Computes the trust between two entities by scoring them with the dedup decision tree and rescaling the
 	 * score against the configured threshold.
 	 *
 	 * @param r1 the first entity
 	 * @param r2 the second entity
 	 * @return the rescaled score, or BrokerConstants.MIN_TRUST when the dedup configuration is not available
 	 *         or the comparison raises an exception
 	 */
 	protected static float calculateTrust(final OaBrokerMainEntity r1, final OaBrokerMainEntity r2) {
 		if (dedupConfig != null) {
 			try {
 				final ObjectMapper jsonMapper = new ObjectMapper();
 				// both entities are serialized to JSON and read back through the JPath model of the configuration
 				final String json1 = jsonMapper.writeValueAsString(r1);
 				final MapDocument left = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, json1);
 				final String json2 = jsonMapper.writeValueAsString(r2);
 				final MapDocument right = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, json2);
 				final TreeProcessor processor = new TreeProcessor(dedupConfig);
 				final double similarity = processor.computeScore(left, right);
 				return TrustUtils.rescale(similarity, dedupConfig.getWf().getThreshold());
 			} catch (final Exception e) {
 				log.error("Error computing score between results", e);
 			}
 		}
 		return BrokerConstants.MIN_TRUST;
 	}
 	public static float rescale(final double score, final double threshold) {
 		if (score >= BrokerConstants.MAX_TRUST) {
 			return BrokerConstants.MAX_TRUST;
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
@ -4,20 +4,12 @@ package eu.dnetlib.dhp.broker.oa.util;
 import java.util.function.BiConsumer;
 import java.util.function.Function;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.broker.objects.OaBrokerEventPayload;
 import eu.dnetlib.broker.objects.OaBrokerInstance;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerProvenance;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.tree.support.TreeProcessor;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 public final class UpdateInfo<T> {
@ -29,26 +21,27 @@ public final class UpdateInfo<T> {
 	private final OaBrokerMainEntity target;
 	private final OaBrokerRelatedDatasource targetDs;
 	private final BiConsumer<OaBrokerMainEntity, T> compileHighlight;
 	private final Function<T, String> highlightToString;
 	private final float trust;
 	private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);
 	public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source,
 		final OaBrokerMainEntity target,
 		final OaBrokerRelatedDatasource targetDs,
 		final BiConsumer<OaBrokerMainEntity, T> compileHighlight,
-		final Function<T, String> highlightToString,
+		final Function<T, String> highlightToString) {
 		final DedupConfig dedupConfig) {
 		this.topic = topic;
 		this.highlightValue = highlightValue;
 		this.source = source;
 		this.target = target;
 		this.targetDs = targetDs;
 		this.compileHighlight = compileHighlight;
 		this.highlightToString = highlightToString;
-		this.trust = calculateTrust(dedupConfig, source, target);
+		this.trust = TrustUtils.calculateTrust(source, target);
 	}
 	public T getHighlightValue() {
@ -63,29 +56,8 @@ public final class UpdateInfo<T> {
 		return target;
 	}
-	private float calculateTrust(final DedupConfig dedupConfig,
+	public OaBrokerRelatedDatasource getTargetDs() {
-		final OaBrokerMainEntity r1,
+		return targetDs;
 		final OaBrokerMainEntity r2) {
 		if (dedupConfig == null) {
 			return BrokerConstants.MIN_TRUST;
 		}
 		try {
 			final ObjectMapper objectMapper = new ObjectMapper();
 			final MapDocument doc1 = MapDocumentUtil
 				.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
 			final MapDocument doc2 = MapDocumentUtil
 				.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
 			final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
 			final double threshold = dedupConfig.getWf().getThreshold();
 			return TrustUtils.rescale(score, threshold);
 		} catch (final Exception e) {
 			log.error("Error computing score between results", e);
 			return BrokerConstants.MIN_TRUST;
 		}
 	}
 	protected Topic getTopic() {
@ -112,7 +84,20 @@ public final class UpdateInfo<T> {
 		compileHighlight.accept(hl, getHighlightValue());
 		final String provId = getSource().getOpenaireId();
-		final String provRepo = getSource().getCollectedFromName();
+		final String provRepo = getSource()
 			.getDatasources()
 			.stream()
 			.filter(ds -> ds.getRelType().equals(BrokerConstants.COLLECTED_FROM_REL))
 			.map(ds -> ds.getName())
 			.findFirst()
 			.orElse("");
 		final String provType = getSource()
 			.getDatasources()
 			.stream()
 			.filter(ds -> ds.getRelType().equals(BrokerConstants.COLLECTED_FROM_REL))
 			.map(ds -> ds.getType())
 			.findFirst()
 			.orElse("");
 		final String provUrl = getSource()
 			.getInstances()
@ -122,7 +107,7 @@ public final class UpdateInfo<T> {
 			.orElse(null);
 		;
-		final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provUrl);
+		final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provType, provUrl);
 		final OaBrokerEventPayload res = new OaBrokerEventPayload();
 		res.setResult(target);
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/DatasourceStats.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/DatasourceStats.java
@ -0,0 +1,61 @@
 package eu.dnetlib.dhp.broker.oa.util.aggregators.stats;
 import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Map;
 /**
  * Bean with the statistics of a datasource: its id, name and type plus a counter for each event topic.
  */
 public class DatasourceStats implements Serializable {
 	private static final long serialVersionUID = -282112564184047677L;
 	private String id;
 	private String name;
 	private String type;
 	// topic -> number of events counted for that topic
 	private Map<String, Long> topics = new HashMap<>();
 	public String getId() {
 		return id;
 	}
 	public void setId(final String id) {
 		this.id = id;
 	}
 	public String getName() {
 		return name;
 	}
 	public void setName(final String name) {
 		this.name = name;
 	}
 	public String getType() {
 		return type;
 	}
 	public void setType(final String type) {
 		this.type = type;
 	}
 	public Map<String, Long> getTopics() {
 		return topics;
 	}
 	public void setTopics(final Map<String, Long> topics) {
 		this.topics = topics;
 	}
 	/**
 	 * Adds the given amount to the counter of a topic, creating the counter when the topic is new.
 	 *
 	 * @param topic the topic to update
 	 * @param inc   the amount to add
 	 */
 	public void incrementTopic(final String topic, final long inc) {
 		// single map operation: stores inc for a new topic (or one mapped to null), otherwise sums it up
 		topics.merge(topic, inc, Long::sum);
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java
@ -0,0 +1,59 @@
 package eu.dnetlib.dhp.broker.oa.util.aggregators.stats;
 import org.apache.commons.lang.StringUtils;
 import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.expressions.Aggregator;
 import eu.dnetlib.dhp.broker.model.Event;
 /**
  * Spark aggregator that folds events into a DatasourceStats: the datasource id, name and type are copied
  * from the events and a counter is kept for each topic.
  */
 public class StatsAggregator extends Aggregator<Event, DatasourceStats, DatasourceStats> {
 	private static final long serialVersionUID = 6652105853037330529L;
 	@Override
 	public Encoder<DatasourceStats> bufferEncoder() {
 		return Encoders.bean(DatasourceStats.class);
 	}
 	@Override
 	public Encoder<DatasourceStats> outputEncoder() {
 		return Encoders.bean(DatasourceStats.class);
 	}
 	@Override
 	public DatasourceStats zero() {
 		return new DatasourceStats();
 	}
 	/**
 	 * Adds one event to the buffer: the identity fields are overwritten with the ones of the event and the
 	 * counter of the event topic grows by one.
 	 */
 	@Override
 	public DatasourceStats reduce(final DatasourceStats stats, final Event e) {
 		stats.setId(e.getMap().getTargetDatasourceId());
 		stats.setName(e.getMap().getTargetDatasourceName());
 		stats.setType(e.getMap().getTargetDatasourceType());
 		stats.incrementTopic(e.getTopic(), 1L);
 		return stats;
 	}
 	/**
 	 * Merges the second buffer into the first one: the identity fields are taken from the second buffer only
 	 * when the first one has a blank id, the topic counters are always summed.
 	 */
 	@Override
 	public DatasourceStats merge(final DatasourceStats stats0, final DatasourceStats stats1) {
 		final boolean missingIdentity = StringUtils.isBlank(stats0.getId());
 		if (missingIdentity) {
 			stats0.setId(stats1.getId());
 			stats0.setName(stats1.getName());
 			stats0.setType(stats1.getType());
 		}
 		stats1.getTopics().forEach((topic, count) -> stats0.incrementTopic(topic, count));
 		return stats0;
 	}
 	@Override
 	public DatasourceStats finish(final DatasourceStats stats) {
 		return stats;
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasource.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasource.java
@ -0,0 +1,42 @@
 package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
 import java.io.Serializable;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 /**
  * Serializable pair made of a source identifier and a broker datasource related to it.
  */
 public class RelatedDatasource implements Serializable {
 	private static final long serialVersionUID = 3015550240920424010L;
 	// NOTE(review): presumably the id of the entity the datasource is related to - confirm against the job that builds it
 	private String source;
 	// the related datasource; RelatedDatasourceAggregator adds it to the datasources of the main entity
 	private OaBrokerRelatedDatasource relDatasource;
 	// no-arg constructor; NOTE(review): presumably needed for bean-style (de)serialization - confirm
 	public RelatedDatasource() {
 	}
 	public RelatedDatasource(final String source, final OaBrokerRelatedDatasource relDatasource) {
 		this.source = source;
 		this.relDatasource = relDatasource;
 	}
 	public String getSource() {
 		return source;
 	}
 	public void setSource(final String source) {
 		this.source = source;
 	}
 	public OaBrokerRelatedDatasource getRelDatasource() {
 		return relDatasource;
 	}
 	public void setRelDatasource(final OaBrokerRelatedDatasource relDatasource) {
 		this.relDatasource = relDatasource;
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasourceAggregator.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasourceAggregator.java
@ -0,0 +1,69 @@
 package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.expressions.Aggregator;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
 import scala.Tuple2;
 /**
  * Spark aggregator that attaches the related datasources to a main entity, keeping at most
  * BrokerConstants.MAX_NUMBER_OF_RELS datasources per entity.
  */
 public class RelatedDatasourceAggregator
 	extends Aggregator<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity, OaBrokerMainEntity> {
 	private static final long serialVersionUID = -7212121913834713672L;
 	@Override
 	public Encoder<OaBrokerMainEntity> bufferEncoder() {
 		return Encoders.bean(OaBrokerMainEntity.class);
 	}
 	@Override
 	public Encoder<OaBrokerMainEntity> outputEncoder() {
 		return Encoders.bean(OaBrokerMainEntity.class);
 	}
 	@Override
 	public OaBrokerMainEntity zero() {
 		return new OaBrokerMainEntity();
 	}
 	/**
 	 * Adds the datasource of the tuple (when present) to the buffer, unless the limit has been reached.
 	 * A buffer with a blank openaire id is replaced by the entity of the tuple.
 	 */
 	@Override
 	public OaBrokerMainEntity reduce(final OaBrokerMainEntity g,
 		final Tuple2<OaBrokerMainEntity, RelatedDatasource> t) {
 		final boolean bufferInitialised = StringUtils.isNotBlank(g.getOpenaireId());
 		final OaBrokerMainEntity acc = bufferInitialised ? g : t._1;
 		if (t._2 == null) {
 			return acc;
 		}
 		if (acc.getDatasources().size() < BrokerConstants.MAX_NUMBER_OF_RELS) {
 			acc.getDatasources().add(t._2.getRelDatasource());
 		}
 		return acc;
 	}
 	/**
 	 * Merges two buffers: when the first one has a blank openaire id the second one is returned as is,
 	 * otherwise the datasources of the second are appended to the first until the limit is reached.
 	 */
 	@Override
 	public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) {
 		if (StringUtils.isBlank(g1.getOpenaireId())) {
 			return g2;
 		}
 		final int free = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getDatasources().size();
 		if (free > 0) {
 			// take as many datasources as the remaining room allows
 			final int toCopy = Math.min(free, g2.getDatasources().size());
 			g1.getDatasources().addAll(g2.getDatasources().subList(0, toCopy));
 		}
 		return g1;
 	}
 	@Override
 	public OaBrokerMainEntity finish(final OaBrokerMainEntity g) {
 		return g;
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json
@ -0,0 +1,122 @@
 {
 	"wf": {
 	},
 	"pace": {
 		"clustering": [
 			{
 				"name": "wordssuffixprefix",
 				"fields": [
 					"title"
 				],
 				"params": {
 					"max": "2",
 					"len": "3"
 				}
 			},
 			{
 				"name": "lowercase",
 				"fields": [
 					"doi"
 				],
 				"params": {
 				}
 			}
 		],
 		"decisionTree": {
 			"start": {
 				"fields": [
 					{
 						"field": "doi",
 						"comparator": "exactMatch",
 						"weight": 1.0,
 						"countIfUndefined": "false",
 						"params": {
 						}
 					}
 				],
 				"threshold": 0.5,
 				"aggregation": "AVG",
 				"positive": "MATCH",
 				"negative": "layer1",
 				"undefined": "layer1",
 				"ignoreUndefined": "true"
 			},
 			"layer1": {
 				"fields": [
 					{
 						"field": "title",
 						"comparator": "titleVersionMatch",
 						"weight": 0.9,
 						"countIfUndefined": "false",
 						"params": {
 						}
 					},
 					{
 						"field": "authors",
 						"comparator": "sizeMatch",
 						"weight": 0.9,
 						"countIfUndefined": "false",
 						"params": {
 						}
 					}
 				],
 				"threshold": 0.5,
 				"aggregation": "AVG",
 				"positive": "MATCH",
 				"negative": "layer2",
 				"undefined": "layer2",
 				"ignoreUndefined": "true"
 			},
 			"layer2": {
 				"fields": [
 					{
 						"field": "title",
 						"comparator": "levensteinTitle",
 						"weight": 1.0,
 						"countIfUndefined": "true",
 						"params": {
 						}
 					}
 				],
 				"threshold": 0.99,
 				"aggregation": "AVG",
 				"positive": "MATCH",
 				"negative": "NO_MATCH",
 				"undefined": "NO_MATCH",
 				"ignoreUndefined": "true"
 			}
 		},
 		"model": [
 			{
 				"name": "doi",
 				"type": "String",
 				"path": "$.pids[?(@.type == 'doi')].value"
 			},
 			{
 				"name": "title",
 				"type": "String",
 				"path": "$.titles",
 				"length": 250,
 				"size": 5
 			},
 			{
 				"name": "authors",
 				"type": "List",
 				"path": "$.creators[*].fullname",
 				"size": 200
 			}
 		],
 		"blacklists": {
 		},
 		"synonyms": {
 		}
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml
@ -10,14 +10,28 @@
            <description>the path where the generated data will be stored</description>
        </property>
 		<property>
-            <name>isLookupUrl</name>
+            <name>datasourceIdWhitelist</name>
-            <description>the address of the lookUp service</description>
+            <value>-</value>
            <description>a white list (comma separated, - for empty list) of datasource ids</description>
        </property>
 		<property>
-            <name>dedupConfProfId</name>
+            <name>datasourceTypeWhitelist</name>
-            <description>the id of a valid Dedup Configuration Profile</description>
+            <value>-</value>
            <description>a white list (comma separated, - for empty list) of datasource types</description>
        </property>
 		<property>
            <name>datasourceIdBlacklist</name>
            <value>-</value>
            <description>a black list (comma separated, - for empty list) of datasource ids</description>
        </property>
        <property>
            <name>esIndexName</name>
            <description>the elasticsearch index name</description>
        </property>
        <property>
            <name>esIndexHost</name>
            <description>the elasticsearch host</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -89,6 +103,7 @@
    <fork name="start_entities_and_rels">
 		<path start="prepare_simple_entities"/>
 		<path start="prepare_related_datasources"/>
        <path start="prepare_related_softwares"/> 
        <path start="prepare_related_datasets"/>
        <path start="prepare_related_projects"/>
@ -119,6 +134,30 @@
        <error to="Kill"/>
    </action>
    <action name="prepare_related_datasources">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>PrepareRelatedDatasourcesJob</name>
            <class>eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasourcesJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
        </spark>
        <ok to="wait_entities_and_rels"/>
        <error to="Kill"/>
    </action>
    <action name="prepare_related_datasets">
        <spark xmlns="uri:oozie:spark-action:0.2">
@ -216,7 +255,31 @@
        <error to="Kill"/>
    </action>
-	<join name="wait_entities_and_rels" to="join_entities_step1"/>
+	<join name="wait_entities_and_rels" to="join_entities_step0"/>
    <action name="join_entities_step0">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>JoinStep0</name>
            <class>eu.dnetlib.dhp.broker.oa.JoinStep0Job</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
        </spark>
        <ok to="join_entities_step1"/>
        <error to="Kill"/>
    </action>
    <action name="join_entities_step1">
        <spark xmlns="uri:oozie:spark-action:0.2">
@ -356,8 +419,58 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+			<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
-            <arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
+			<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
 			<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
        </spark>
        <ok to="index_es"/>
        <error to="Kill"/>
    </action>
     <action name="index_es">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>IndexOnESJob</name>
            <class>eu.dnetlib.dhp.broker.oa.IndexOnESJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="8" 
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--index</arg><arg>${esIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
        </spark>
        <ok to="stats"/>
        <error to="Kill"/>
    </action>
    <action name="stats">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>GenerateStatsJob</name>
            <class>eu.dnetlib.dhp.broker.oa.GenerateStatsJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json
@ -6,15 +6,21 @@
 		"paramRequired": true
 	},
 	{
-		"paramName": "lu",
+		"paramName": "datasourceIdWhitelist",
-		"paramLongName": "isLookupUrl",
+		"paramLongName": "datasourceIdWhitelist",
-		"paramDescription": "the address of the ISLookUpService",
+		"paramDescription": "a white list (comma separated, - for empty list) of datasource ids",
 		"paramRequired": true
 	},
 	{
-		"paramName": "d",
+		"paramName": "datasourceTypeWhitelist",
-		"paramLongName": "dedupConfProfile",
+		"paramLongName": "datasourceTypeWhitelist",
-		"paramDescription": "the id of a valid Dedup Configuration Profile",
+		"paramDescription": "a white list (comma separated, - for empty list) of datasource types",
 		"paramRequired": true
 	},
 	{
 		"paramName": "datasourceIdBlacklist",
 		"paramLongName": "datasourceIdBlacklist",
 		"paramDescription": "a black list (comma separated, - for empty list) of datasource ids",
 		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json
@ -0,0 +1,20 @@
 [
 	{
 		"paramName": "o",
 		"paramLongName": "workingPath",
 		"paramDescription": "the working path",
 		"paramRequired": true
 	},
 	{
 		"paramName": "idx",
 		"paramLongName": "index",
 		"paramDescription": "the ES index",
 		"paramRequired": true
 	},
 	{
 		"paramName": "es",
 		"paramLongName": "esHost",
 		"paramDescription": "the ES host",
 		"paramRequired": true
 	}
 ]
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml
@ -1,4 +1,4 @@
-<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
@ -9,15 +9,6 @@
            <name>workingPath</name>
            <description>the path where the generated data will be stored</description>
        </property>
        <property>
            <name>isLookupUrl</name>
            <description>the address of the lookUp service</description>
        </property>
        <property>
            <name>dedupConfProfId</name>
            <description>the id of a valid Dedup Configuration Profile</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -79,7 +70,6 @@
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="generate_events">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -98,8 +88,58 @@
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+			<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
-            <arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
+			<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
 			<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
        </spark>
        <ok to="index_es"/>
        <error to="Kill"/>
    </action>
     <action name="index_es">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>IndexOnESJob</name>
            <class>eu.dnetlib.dhp.broker.oa.IndexOnESJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="8" 
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--index</arg><arg>${esIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
        </spark>
        <ok to="stats"/>
        <error to="Kill"/>
       </action>
    	<action name="stats">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>GenerateStatsJob</name>
            <class>eu.dnetlib.dhp.broker.oa.GenerateStatsJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java
+++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java
@ -0,0 +1,133 @@
 package eu.dnetlib.dhp.broker.oa.matchers;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.util.Arrays;
 import java.util.Collection;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
 import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
 import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
/**
 * Exercises {@code searchUpdatesForRecord} of an {@link UpdateMatcher} backed by
 * {@link EnrichMissingPublicationDate}, always against a mocked target datasource and four
 * other records.
 */
@ExtendWith(MockitoExtension.class)
 class UpdateMatcherTest {
 	UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
 	@Mock
 	private OaBrokerRelatedDatasource targetDs;
 	@BeforeEach
 	void setUp() throws Exception {
 	}
 	// No publication date anywhere: nothing to suggest.
 	@Test
 	void testSearchUpdatesForRecord_1() {
 		final Collection<UpdateInfo<String>> updates = search(
 			entity(null), entity(null), entity(null), entity(null), entity(null));
 		assertTrue(updates.isEmpty());
 	}
 	// Only the record itself has a date: nothing to suggest.
 	@Test
 	void testSearchUpdatesForRecord_2() {
 		final Collection<UpdateInfo<String>> updates = search(
 			entity("2018"), entity(null), entity(null), entity(null), entity(null));
 		assertTrue(updates.isEmpty());
 	}
 	// The record has no date and exactly one of the others has it: one update expected.
 	@Test
 	void testSearchUpdatesForRecord_3() {
 		final Collection<UpdateInfo<String>> updates = search(
 			entity(null), entity(null), entity("2018"), entity(null), entity(null));
 		assertTrue(updates.size() == 1);
 	}
 	// The record already has the date carried by one of the others: nothing to suggest.
 	@Test
 	void testSearchUpdatesForRecord_4() {
 		final Collection<UpdateInfo<String>> updates = search(
 			entity("2018"), entity(null), entity("2018"), entity(null), entity(null));
 		assertTrue(updates.isEmpty());
 	}
 	// Every record, including the main one, has the same date: nothing to suggest.
 	@Test
 	void testSearchUpdatesForRecord_5() {
 		final Collection<UpdateInfo<String>> updates = search(
 			entity("2018"), entity("2018"), entity("2018"), entity("2018"), entity("2018"));
 		assertTrue(updates.isEmpty());
 	}
 	// The record has no date and all the others share the same one: a single update expected.
 	@Test
 	void testSearchUpdatesForRecord_6() {
 		final Collection<UpdateInfo<String>> updates = search(
 			entity(null), entity("2018"), entity("2018"), entity("2018"), entity("2018"));
 		assertTrue(updates.size() == 1);
 	}
 	// Creates a fresh entity, setting the publication date only when one is given.
 	private static OaBrokerMainEntity entity(final String publicationDate) {
 		final OaBrokerMainEntity e = new OaBrokerMainEntity();
 		if (publicationDate != null) {
 			e.setPublicationdate(publicationDate);
 		}
 		return e;
 	}
 	// Runs the matcher for the given record against the other records and the mocked datasource.
 	private Collection<UpdateInfo<String>> search(final OaBrokerMainEntity res, final OaBrokerMainEntity... others) {
 		return matcher.searchUpdatesForRecord(res, targetDs, Arrays.asList(others), null);
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java
+++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java
@ -0,0 +1,57 @@
 package eu.dnetlib.dhp.broker.oa.matchers.simple;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.util.List;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 /**
  * Checks {@code findDifferences} of {@link EnrichMissingPublicationDate} for every combination
  * of present/absent publication date on the source and target records.
  */
 class EnrichMissingPublicationDateTest {
 	final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
 	@BeforeEach
 	void setUp() throws Exception {
 	}
 	// Neither record has a date.
 	@Test
 	void testFindDifferences_1() {
 		assertTrue(differences(null, null).isEmpty());
 	}
 	// Only the source has a date: exactly one difference is reported.
 	@Test
 	void testFindDifferences_2() {
 		assertTrue(differences("2018", null).size() == 1);
 	}
 	// Only the target has a date.
 	@Test
 	void testFindDifferences_3() {
 		assertTrue(differences(null, "2018").isEmpty());
 	}
 	// Both records have the same date.
 	@Test
 	void testFindDifferences_4() {
 		assertTrue(differences("2018", "2018").isEmpty());
 	}
 	// Builds the two records (a null date means "leave it unset") and runs the matcher on them.
 	private List<String> differences(final String sourceDate, final String targetDate) {
 		final OaBrokerMainEntity source = new OaBrokerMainEntity();
 		final OaBrokerMainEntity target = new OaBrokerMainEntity();
 		if (sourceDate != null) {
 			source.setPublicationdate(sourceDate);
 		}
 		if (targetDate != null) {
 			target.setPublicationdate(targetDate);
 		}
 		return matcher.findDifferences(source, target);
 	}
 }
--- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java
+++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java
@ -5,6 +5,10 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.broker.objects.OaBrokerAuthor;
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.broker.objects.OaBrokerTypedValue;
 public class TrustUtilsTest {
 	private static final double THRESHOLD = 0.95;
@ -64,6 +68,23 @@ public class TrustUtilsTest {
 		verifyValue(2.00, BrokerConstants.MAX_TRUST);
 	}
 	/**
 	 * Smoke test: prints the trust computed between a record and a reduced copy of it (same
 	 * title, no doi pid, only the first creator). No assertion is made on the printed value.
 	 */
 	@Test
 	public void test() throws Exception {
 		final String title = "D-NET Service Package: Data Import";
 		final OaBrokerMainEntity complete = new OaBrokerMainEntity();
 		complete.getTitles().add(title);
 		complete.getPids().add(new OaBrokerTypedValue("doi", "123"));
 		complete.getCreators().add(new OaBrokerAuthor("Michele Artini", null));
 		complete.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
 		// the reduced record deliberately carries neither the doi pid nor the second creator
 		final OaBrokerMainEntity reduced = new OaBrokerMainEntity();
 		reduced.getTitles().add(title);
 		reduced.getCreators().add(new OaBrokerAuthor("Michele Artini", null));
 		System.out.println("TRUST: " + TrustUtils.calculateTrust(complete, reduced));
 	}
 	private void verifyValue(final double originalScore, final float expectedTrust) {
 		final float trust = TrustUtils.rescale(originalScore, THRESHOLD);
 		System.out.println(trust);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java
@ -28,6 +28,8 @@ import eu.dnetlib.pace.config.DedupConfig;
 abstract class AbstractSparkAction implements Serializable {
 	protected static final int NUM_PARTITIONS = 1000;
 	protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
 		.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java
@ -100,6 +100,11 @@ public class DedupUtility {
 		return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType);
 	}
 	/**
 	 * Builds the location of the block statistics for an entity type:
 	 * {@code <basePath>/<actionSetId>/<entityType>_blockstats}.
 	 *
 	 * @param basePath    root of the path
 	 * @param actionSetId second path segment
 	 * @param entityType  prefix of the last path segment
 	 * @return the three parts joined with {@code /}, with {@code _blockstats} appended
 	 */
 	public static String createBlockStatsPath(
 		final String basePath, final String actionSetId, final String entityType) {
 		return String.join("/", basePath, actionSetId, entityType + "_blockstats");
 	}
 	public static List<DedupConfig> getConfigurations(String isLookUpUrl, String orchestrator)
 		throws ISLookUpException, DocumentException {
 		final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java
@ -37,7 +37,7 @@ public class Deduper implements Serializable {
 	public static JavaPairRDD<String, Block> createSortedBlocks(
 		JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
 		final String of = config.getWf().getOrderField();
-		final int maxQueueSize = config.getWf().getGroupMaxSize();
+		final int maxQueueSize = config.getWf().getQueueMaxSize();
 		return mapDocs
 			// the reduce is just to be sure that we haven't document with same id
@ -52,6 +52,7 @@ public class Deduper implements Serializable {
 					.collect(Collectors.toList())
 					.iterator())
 			.mapToPair(block -> new Tuple2<>(block.getKey(), block))
-			.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
+			.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize))
 			.filter(b -> b._2().getDocuments().size() > 1);
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java
@ -0,0 +1,57 @@
 package eu.dnetlib.dhp.oa.dedup;
 import java.util.Objects;
 import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.expressions.Aggregator;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 /**
  * Typed Spark {@link Aggregator} that folds a group of {@link Relation}s into a single one by
  * calling {@code mergeFrom} on the accumulated relation. Buffer and output are both encoded
  * with Kryo.
  */
 public class RelationAggregator extends Aggregator<Relation, Relation, Relation> {
 	// Neutral element of the aggregation. It is only ever compared against, never merged into,
 	// so it is declared final to prevent accidental reassignment of the shared sentinel.
 	private static final Relation ZERO = new Relation();
 	/** @return the shared empty relation used as the initial buffer */
 	@Override
 	public Relation zero() {
 		return ZERO;
 	}
 	/** Folds one input relation {@code a} into the buffer {@code b}. */
 	@Override
 	public Relation reduce(Relation b, Relation a) {
 		return mergeRel(b, a);
 	}
 	/** Combines two partial buffers. */
 	@Override
 	public Relation merge(Relation b, Relation a) {
 		return mergeRel(b, a);
 	}
 	/** @return the buffer itself, no final transformation is applied */
 	@Override
 	public Relation finish(Relation r) {
 		return r;
 	}
 	/**
 	 * Merges {@code a} into {@code b}, treating {@link #ZERO} as the identity: when one side
 	 * equals ZERO the other side is returned untouched. Otherwise {@code b} is mutated via
 	 * {@code mergeFrom} and returned.
 	 */
 	private Relation mergeRel(Relation b, Relation a) {
 		// NOTE(review): the buffer is Kryo-encoded, so a deserialized copy of ZERO may reach this
 		// point; the check then depends on Relation.equals accepting an empty relation — confirm.
 		if (Objects.equals(b, ZERO)) {
 			return a;
 		}
 		if (Objects.equals(a, ZERO)) {
 			return b;
 		}
 		b.mergeFrom(a);
 		return b;
 	}
 	@Override
 	public Encoder<Relation> bufferEncoder() {
 		return Encoders.kryo(Relation.class);
 	}
 	@Override
 	public Encoder<Relation> outputEncoder() {
 		return Encoders.kryo(Relation.class);
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
@ -0,0 +1,126 @@
 package eu.dnetlib.dhp.oa.dedup;
 import java.io.IOException;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.dom4j.DocumentException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.dedup.model.Block;
 import eu.dnetlib.dhp.oa.dedup.model.BlockStats;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import scala.Tuple2;
 /**
  * Spark action that, for every dedup configuration of an action set, builds the sorted blocks
  * of the corresponding entities and stores one {@link BlockStats} record per block (block key,
  * number of documents, estimated number of comparisons).
  */
 public class SparkBlockStats extends AbstractSparkAction {
 	private static final Logger log = LoggerFactory.getLogger(SparkBlockStats.class);
 	public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) {
 		super(parser, spark);
 	}
 	/**
 	 * Entry point: parses the arguments described by {@code createBlockStats_parameters.json}
 	 * and runs the action against the lookup service found at {@code isLookUpUrl}.
 	 */
 	public static void main(String[] args) throws Exception {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					SparkBlockStats.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json")));
 		parser.parseArgument(args);
 		SparkConf conf = new SparkConf();
 		new SparkBlockStats(parser, getSparkSession(conf))
 			.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
 	}
 	/**
 	 * Estimates the number of pairwise comparisons needed for a block scanned with a sliding
 	 * window.
 	 *
 	 * @param blockSize         number of documents in the block
 	 * @param slidingWindowSize size of the sliding window
 	 * @return {@code n(n-1)/2} when the window covers the whole block (every pair is compared
 	 *         once); otherwise {@code (n - w + 1) * w(w-1)/2}, i.e. the pairs of a full window
 	 *         counted once per window position
 	 */
 	public Long computeComparisons(Long blockSize, Long slidingWindowSize) {
 		if (slidingWindowSize >= blockSize) {
 			// the window can never compare more pairs than the block actually contains
 			return (blockSize * (blockSize - 1)) / 2;
 		}
 		return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2;
 	}
 	/**
 	 * Computes and saves the block statistics for each dedup configuration returned by
 	 * {@code getConfigurations} for the configured action set.
 	 *
 	 * @param isLookUpService lookup service used to load the dedup configurations
 	 */
 	@Override
 	public void run(ISLookUpService isLookUpService)
 		throws DocumentException, IOException, ISLookUpException {
 		// read oozie parameters
 		final String graphBasePath = parser.get("graphBasePath");
 		final String isLookUpUrl = parser.get("isLookUpUrl");
 		final String actionSetId = parser.get("actionSetId");
 		final String workingPath = parser.get("workingPath");
 		// falls back to NUM_PARTITIONS when the optional parameter is not provided
 		final int numPartitions = Optional
 			.ofNullable(parser.get("numPartitions"))
 			.map(Integer::valueOf)
 			.orElse(NUM_PARTITIONS);
 		log.info("numPartitions: '{}'", numPartitions);
 		log.info("graphBasePath: '{}'", graphBasePath);
 		log.info("isLookUpUrl:   '{}'", isLookUpUrl);
 		log.info("actionSetId:   '{}'", actionSetId);
 		log.info("workingPath:   '{}'", workingPath);
 		// the Java wrapper of the context does not depend on the configuration: create it once
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		// for each dedup configuration
 		for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
 			final String subEntity = dedupConf.getWf().getSubEntityValue();
 			log.info("Creating blockstats for: '{}'", subEntity);
 			final String outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity);
 			removeOutputDir(spark, outputPath);
 			// one (identifier, document) pair per input line
 			JavaPairRDD<String, MapDocument> mapDocuments = sc
 				.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 				.repartition(numPartitions)
 				.mapToPair(
 					(PairFunction<String, String, MapDocument>) s -> {
 						MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
 						return new Tuple2<>(d.getIdentifier(), d);
 					});
 			// create blocks for deduplication
 			JavaRDD<BlockStats> blockStats = Deduper
 				.createSortedBlocks(mapDocuments, dedupConf)
 				.repartition(numPartitions)
 				.map(b -> asBlockStats(dedupConf, b));
 			// save the blockstats in the workingdir
 			spark
 				.createDataset(blockStats.rdd(), Encoders.bean(BlockStats.class))
 				.write()
 				.mode(SaveMode.Overwrite)
 				.save(outputPath);
 		}
 	}
 	/** Converts a (key, block) pair into its statistics record: key, size, estimated comparisons. */
 	private BlockStats asBlockStats(DedupConfig dedupConf, Tuple2<String, Block> b) {
 		final long size = b._2().getDocuments().size();
 		final long window = dedupConf.getWf().getSlidingWindowSize();
 		return new BlockStats(b._1(), size, computeComparisons(size, window));
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
@ -5,11 +5,13 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
 import org.apache.spark.rdd.RDD;
@ -75,7 +77,11 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 		final String workingPath = parser.get("workingPath");
 		final String isLookUpUrl = parser.get("isLookUpUrl");
 		final String actionSetId = parser.get("actionSetId");
-
+		int cut = Optional
 			.ofNullable(parser.get("cutConnectedComponent"))
 			.map(Integer::valueOf)
 			.orElse(0);
 		log.info("connected component cut: '{}'", cut);
 		log.info("graphBasePath: '{}'", graphBasePath);
 		log.info("isLookUpUrl:   '{}'", isLookUpUrl);
 		log.info("actionSetId:   '{}'", actionSetId);
@ -100,8 +106,10 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			final RDD<Edge<String>> edgeRdd = spark
 				.read()
-				.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
+				.textFile(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
-				.as(Encoders.bean(Relation.class))
+				.map(
 					(MapFunction<String, Relation>) r -> OBJECT_MAPPER.readValue(r, Relation.class),
 					Encoders.bean(Relation.class))
 				.javaRDD()
 				.map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass()))
 				.rdd();
@ -109,7 +117,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 			final Dataset<Relation> mergeRels = spark
 				.createDataset(
 					GraphProcessor
-						.findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+						.findCCs(vertexes.rdd(), edgeRdd, maxIterations, cut)
 						.toJavaRDD()
 						.filter(k -> k.getDocIds().size() > 1)
 						.flatMap(cc -> ccToMergeRel(cc, dedupConf))
@ -117,6 +125,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 					Encoders.bean(Relation.class));
 			mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath);
 		}
 	}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
@ -2,6 +2,7 @@
 package eu.dnetlib.dhp.oa.dedup;
 import java.io.IOException;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@ -48,13 +49,6 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 		parser.parseArgument(args);
 		SparkConf conf = new SparkConf();
 		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
 		conf
 			.registerKryoClasses(
 				new Class[] {
 					MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class
 				});
 		new SparkCreateSimRels(parser, getSparkSession(conf))
 			.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
 	}
@ -68,7 +62,12 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 		final String isLookUpUrl = parser.get("isLookUpUrl");
 		final String actionSetId = parser.get("actionSetId");
 		final String workingPath = parser.get("workingPath");
 		final int numPartitions = Optional
 			.ofNullable(parser.get("numPartitions"))
 			.map(Integer::valueOf)
 			.orElse(NUM_PARTITIONS);
 		log.info("numPartitions: '{}'", numPartitions);
 		log.info("graphBasePath: '{}'", graphBasePath);
 		log.info("isLookUpUrl:   '{}'", isLookUpUrl);
 		log.info("actionSetId:   '{}'", actionSetId);
@ -88,6 +87,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 			JavaPairRDD<String, MapDocument> mapDocuments = sc
 				.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 				.repartition(numPartitions)
 				.mapToPair(
 					(PairFunction<String, String, MapDocument>) s -> {
 						MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
@ -95,19 +95,17 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 					});
 			// create blocks for deduplication
-			JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
+			JavaPairRDD<String, Block> blocks = Deduper
 				.createSortedBlocks(mapDocuments, dedupConf)
 				.repartition(numPartitions);
 			// create relations by comparing only elements in the same group
-			JavaRDD<Relation> relations = Deduper
+			Deduper
 				.computeRelations(sc, blocks, dedupConf)
-				.map(t -> createSimRel(t._1(), t._2(), entity));
+				.map(t -> createSimRel(t._1(), t._2(), entity))
-
+				.repartition(numPartitions)
-			// save the simrel in the workingdir
+				.map(r -> OBJECT_MAPPER.writeValueAsString(r))
-			spark
+				.saveAsTextFile(outputPath);
 				.createDataset(relations.rdd(), Encoders.bean(Relation.class))
 				.write()
 				.mode(SaveMode.Append)
 				.save(outputPath);
 		}
 	}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@ -4,7 +4,9 @@ package eu.dnetlib.dhp.oa.dedup;
 import static org.apache.spark.sql.functions.col;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
 import org.slf4j.Logger;
@ -95,7 +97,24 @@ public class SparkPropagateRelation extends AbstractSparkAction {
 			FieldType.TARGET,
 			getDeletedFn());
-		save(newRels.union(updated).union(mergeRels), outputRelationPath, SaveMode.Overwrite);
+		save(
 			distinctRelations(
 				newRels
 					.union(updated)
 					.union(mergeRels)
 					.map((MapFunction<Relation, Relation>) r -> r, Encoders.kryo(Relation.class))),
 			outputRelationPath, SaveMode.Overwrite);
 	}
 	/**
 	 * Collapses relations that share the same (source, target, relType, subRelType, relClass) key.
 	 *
 	 * The input is first filtered with {@link #getRelationFilterFunction()}, then grouped by the key and
 	 * each group is reduced to a single Relation by RelationAggregator (defined elsewhere).
 	 *
 	 * @param rels the relations to de-duplicate
 	 * @return one Relation per distinct key, encoded as a bean
 	 */
 	private Dataset<Relation> distinctRelations(Dataset<Relation> rels) {
 		// String.join takes the DELIMITER as its first argument: passing the source id there used it as the
 		// separator (and threw a NullPointerException when the source was null). Use an explicit delimiter
 		// so that every field is a proper element of the key; null elements are rendered as "null".
 		final String keyDelimiter = "\t";
 		return rels
 			.filter(getRelationFilterFunction())
 			.groupByKey(
 				(MapFunction<Relation, String>) r -> String
 					.join(
 						keyDelimiter,
 						r.getSource(), r.getTarget(), r.getRelType(), r.getSubRelType(), r.getRelClass()),
 				Encoders.STRING())
 			.agg(new RelationAggregator().toColumn())
 			// drop the grouping key, keep only the aggregated relation
 			.map((MapFunction<Tuple2<String, Relation>, Relation>) t -> t._2(), Encoders.bean(Relation.class));
 	}
 	private static Dataset<Relation> processDataset(
@ -112,6 +131,14 @@ public class SparkPropagateRelation extends AbstractSparkAction {
 			.map(mapFn, Encoders.bean(Relation.class));
 	}
 	/**
 	 * Builds the predicate applied to relations before they are grouped by key.
 	 *
 	 * A relation is kept when at least one of source, target, relType, subRelType or relClass is not blank.
 	 * NOTE(review): the fields are combined with OR, so only relations where all five are blank are dropped;
 	 * confirm whether AND (all fields required) was intended.
 	 *
 	 * @return the filter function
 	 */
 	private FilterFunction<Relation> getRelationFilterFunction() {
 		return (FilterFunction<Relation>) r -> StringUtils.isNotBlank(r.getSource()) ||
 			StringUtils.isNotBlank(r.getTarget()) ||
 			// was getRelClass(), which is already checked below: relType was never inspected
 			StringUtils.isNotBlank(r.getRelType()) ||
 			StringUtils.isNotBlank(r.getSubRelType()) ||
 			StringUtils.isNotBlank(r.getRelClass());
 	}
 	private static MapFunction<String, Relation> patchRelFn() {
 		return value -> {
 			final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.dedup.graph;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.commons.lang.StringUtils;
 import org.codehaus.jackson.annotate.JsonIgnore;
@ -18,12 +19,17 @@ public class ConnectedComponent implements Serializable {
 	private Set<String> docIds;
 	private String ccId;
-	public ConnectedComponent() {
+	public ConnectedComponent(Set<String> docIds, final int cut) {
 	}
 	public ConnectedComponent(Set<String> docIds) {
 		this.docIds = docIds;
 		createID();
 		if (cut > 0 && docIds.size() > cut) {
 			this.docIds = docIds
 				.stream()
 				.filter(s -> !ccId.equalsIgnoreCase(s))
 				.limit(cut - 1)
 				.collect(Collectors.toSet());
 			this.docIds.add(ccId);
 		}
 	}
 	public String createID() {
@ -41,6 +47,7 @@ public class ConnectedComponent implements Serializable {
 	public String getMin() {
 		final StringBuilder min = new StringBuilder();
 		docIds
 			.forEach(
 				i -> {
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala
@ -7,7 +7,7 @@ import scala.collection.JavaConversions;
 object GraphProcessor {
-  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
+  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int, cut:Int): RDD[ConnectedComponent] = {
    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
    val cc = graph.connectedComponents(maxIterations).vertices
@ -22,15 +22,15 @@ object GraphProcessor {
      }
    }
    val connectedComponents = joinResult.groupByKey()
-      .map[ConnectedComponent](cc => asConnectedComponent(cc))
+      .map[ConnectedComponent](cc => asConnectedComponent(cc, cut))
    connectedComponents
  }
-  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
+  def asConnectedComponent(group: (VertexId, Iterable[String]), cut:Int): ConnectedComponent = {
    val docs = group._2.toSet[String]
-    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
+    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs), cut);
    connectedComponent
  }
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java
@ -0,0 +1,45 @@
 package eu.dnetlib.dhp.oa.dedup.model;
 import java.io.Serializable;
 /**
  * Serializable bean holding the statistics of a single block: its key, the number of elements it
  * contains and the number of comparisons associated with it.
  */
 public class BlockStats implements Serializable {

 	/** Key of the block. */
 	private String key;

 	/** Number of elements in the block. */
 	private Long size;

 	/** Number of comparisons in the block. */
 	private Long comparisons;

 	/** Creates an empty instance; every field is left null. */
 	public BlockStats() {
 	}

 	/**
 	 * Creates a fully populated instance.
 	 *
 	 * @param blockKey key of the block
 	 * @param blockSize number of elements in the block
 	 * @param blockComparisons number of comparisons in the block
 	 */
 	public BlockStats(String blockKey, Long blockSize, Long blockComparisons) {
 		key = blockKey;
 		size = blockSize;
 		comparisons = blockComparisons;
 	}

 	/** @return the key of the block */
 	public String getKey() {
 		return this.key;
 	}

 	/** @param newKey the key of the block */
 	public void setKey(String newKey) {
 		key = newKey;
 	}

 	/** @return the number of elements in the block */
 	public Long getSize() {
 		return this.size;
 	}

 	/** @param newSize the number of elements in the block */
 	public void setSize(Long newSize) {
 		size = newSize;
 	}

 	/** @return the number of comparisons in the block */
 	public Long getComparisons() {
 		return this.comparisons;
 	}

 	/** @param newComparisons the number of comparisons in the block */
 	public void setComparisons(Long newComparisons) {
 		comparisons = newComparisons;
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json
@ -0,0 +1,32 @@
 [
  {
    "paramName": "la",
    "paramLongName": "isLookUpUrl",
    "paramDescription": "address for the LookUp",
    "paramRequired": true
  },
  {
    "paramName": "asi",
    "paramLongName": "actionSetId",
    "paramDescription": "action set identifier (name of the orchestrator)",
    "paramRequired": true
  },
  {
    "paramName": "i",
    "paramLongName": "graphBasePath",
    "paramDescription": "the base path of the raw graph",
    "paramRequired": true
  },
  {
    "paramName": "w",
    "paramLongName": "workingPath",
    "paramDescription": "path of the working directory",
    "paramRequired": true
  },
  {
    "paramName": "np",
    "paramLongName": "numPartitions",
    "paramDescription": "number of partitions for the similarity relations intermediate phases",
    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json
@ -17,6 +17,12 @@
    "paramDescription": "the url for the lookup service",
    "paramRequired": true
  },
  {
    "paramName": "cc",
    "paramLongName": "cutConnectedComponent",
     "paramDescription": "the maximum number of elements that can belong to a connected component",
    "paramRequired": false
  },
  {
    "paramName": "w",
    "paramLongName": "workingPath",
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
@ -22,5 +22,11 @@
    "paramLongName": "workingPath",
    "paramDescription": "path of the working directory",
    "paramRequired": true
  },
  {
    "paramName": "np",
    "paramLongName": "numPartitions",
    "paramDescription": "number of partitions for the similarity relations intermediate phases",
    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
@ -20,6 +20,10 @@
            <name>dedupGraphPath</name>
            <description>path for the output graph</description>
        </property>
        <property>
            <name>cutConnectedComponent</name>
            <description>max number of elements in a connected component</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -106,10 +110,11 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--la</arg><arg>${isLookUpUrl}</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
-            <arg>--asi</arg><arg>${actionSetId}</arg>
+            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--numPartitions</arg><arg>8000</arg>
        </spark>
        <ok to="CreateMergeRel"/>
        <error to="Kill"/>
@ -132,10 +137,11 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--la</arg><arg>${isLookUpUrl}</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
-            <arg>--asi</arg><arg>${actionSetId}</arg>
+            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
            <arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
        </spark>
        <ok to="CreateDedupRecord"/>
        <error to="Kill"/>
@ -158,10 +164,10 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--la</arg><arg>${isLookUpUrl}</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
-            <arg>--asi</arg><arg>${actionSetId}</arg>
+            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
        </spark>
        <ok to="UpdateEntity"/>
        <error to="Kill"/>
@ -184,9 +190,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--o</arg><arg>${dedupGraphPath}</arg>
+            <arg>--dedupGraphPath</arg><arg>${dedupGraphPath}</arg>
        </spark>
        <ok to="copyRelations"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml
@ -0,0 +1,108 @@
 <workflow-app name="Create dedup blocks" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>graphBasePath</name>
            <description>the raw graph base path</description>
        </property>
        <property>
            <name>isLookUpUrl</name>
            <description>the address of the lookUp service</description>
        </property>
        <property>
            <name>actionSetId</name>
            <description>id of the actionSet</description>
        </property>
        <property>
            <name>numPartitions</name>
            <description>number of partitions for the similarity relations intermediate phases</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="CreateBlockStats"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
     <!-- Single action of this workflow: submits eu.dnetlib.dhp.oa.dedup.SparkBlockStats to YARN in
          cluster mode; the workflow moves to "End" on success and to "Kill" on error. -->
     <action name="CreateBlockStats">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Create deduplication blocks</name>
             <class>eu.dnetlib.dhp.oa.dedup.SparkBlockStats</class>
             <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
             <arg>--actionSetId</arg><arg>${actionSetId}</arg>
             <!-- NOTE(review): ${workingDir} is not declared in the <parameters> section of this workflow;
                  presumably it is supplied by the job configuration. Confirm, or declare it explicitly. -->
             <arg>--workingPath</arg><arg>${workingDir}</arg>
             <arg>--numPartitions</arg><arg>${numPartitions}</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -45,6 +45,17 @@ public class EntityMergerTest implements Serializable {
 	}
 	/**
 	 * Merges the software records read from software_merge.json and checks that the best access right
 	 * of the merged record has class id "OPEN SOURCE".
 	 */
 	@Test
 	public void softwareMergerTest() throws InstantiationException, IllegalAccessException {
 		List<Tuple2<String, Software>> softwares = readSample(
 			testEntityBasePath + "/software_merge.json", Software.class);
 		Software merged = DedupRecordFactory
 			.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
 		// assertEquals takes (expected, actual): with the arguments swapped a failure message would
 		// report the two values the wrong way round
 		assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());
 	}
 	@Test
 	public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
@ -3,6 +3,8 @@ package eu.dnetlib.dhp.oa.dedup;
 import static java.nio.file.Files.createTempDirectory;
 import static org.apache.spark.sql.functions.col;
 import static org.apache.spark.sql.functions.count;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.mockito.Mockito.lenient;
@ -11,6 +13,9 @@ import java.io.IOException;
 import java.io.Serializable;
 import java.net.URISyntaxException;
 import java.nio.file.Paths;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
@ -18,6 +23,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.Dataset;
@ -71,11 +77,13 @@ public class SparkDedupTest implements Serializable {
 		FileUtils.deleteDirectory(new File(testOutputBasePath));
 		FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
 		final SparkConf conf = new SparkConf();
 		conf.set("spark.sql.shuffle.partitions", "200");
 		spark = SparkSession
 			.builder()
 			.appName(SparkDedupTest.class.getSimpleName())
 			.master("local[*]")
-			.config(new SparkConf())
+			.config(conf)
 			.getOrCreate();
 		jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -152,33 +160,38 @@ public class SparkDedupTest implements Serializable {
 		parser
 			.parseArgument(
 				new String[] {
-					"-i",
+					"-i", testGraphBasePath,
-					testGraphBasePath,
+					"-asi", testActionSetId,
-					"-asi",
+					"-la", "lookupurl",
-					testActionSetId,
+					"-w", testOutputBasePath,
-					"-la",
+					"-np", "50"
 					"lookupurl",
 					"-w",
 					testOutputBasePath
 				});
 		new SparkCreateSimRels(parser, spark).run(isLookUpService);
 		long orgs_simrel = spark
 			.read()
-			.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
+			.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
 			.count();
 		long pubs_simrel = spark
 			.read()
-			.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
+			.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
 			.count();
 		long sw_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count();
-		long ds_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel").count();
+		long sw_simrel = spark
 			.read()
 			.textFile(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
 			.count();
 		long ds_simrel = spark
 			.read()
 			.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
 			.count();
 		long orp_simrel = spark
 			.read()
-			.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
+			.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
 			.count();
 		assertEquals(3432, orgs_simrel);
@ -190,6 +203,101 @@ public class SparkDedupTest implements Serializable {
 	/**
 	 * Runs SparkCreateMergeRels with a connected-component cut of 3 and verifies, for every entity type,
 	 * that no "merges" source ends up with more than 3 targets.
 	 *
 	 * The mergerel output directories are removed afterwards, also when an assertion fails, so that the
 	 * following ordered test does not find leftovers from this run.
 	 */
 	@Test
 	@Order(2)
 	public void cutMergeRelsTest() throws Exception {
 		final int cut = 3;
 		final String[] entityTypes = {
 			"organization", "publication", "software", "dataset", "otherresearchproduct"
 		};

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					SparkCreateMergeRels.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
 		parser
 			.parseArgument(
 				new String[] {
 					"-i", testGraphBasePath,
 					"-asi", testActionSetId,
 					"-la", "lookupurl",
 					"-w", testOutputBasePath,
 					"-cc", String.valueOf(cut)
 				});

 		try {
 			new SparkCreateMergeRels(parser, spark).run(isLookUpService);

 			for (String entityType : entityTypes) {
 				assertEquals(
 					0, countMergeGroupsLargerThan(entityType, cut),
 					"found " + entityType + " merge groups larger than the cut");
 			}
 		} finally {
 			// deleteDirectory is a no-op for directories that were never created
 			for (String entityType : entityTypes) {
 				FileUtils.deleteDirectory(new File(cutTestMergeRelPath(entityType)));
 			}
 		}
 	}

 	/** Path of the mergerel output written for the given entity type. */
 	private String cutTestMergeRelPath(String entityType) {
 		return testOutputBasePath + "/" + testActionSetId + "/" + entityType + "_mergerel";
 	}

 	/** Counts the "merges" sources of the given entity type having more than maxTargets targets. */
 	private long countMergeGroupsLargerThan(String entityType, int maxTargets) {
 		return spark
 			.read()
 			.load(cutTestMergeRelPath(entityType))
 			.as(Encoders.bean(Relation.class))
 			.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
 			.groupBy("source")
 			.agg(count("target").alias("cnt"))
 			.select("source", "cnt")
 			.where("cnt > " + maxTargets)
 			.count();
 	}
 	@Test
 	@Order(3)
 	public void createMergeRelsTest() throws Exception {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -225,8 +333,10 @@ public class SparkDedupTest implements Serializable {
 			.read()
 			.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
 			.count();
-
+		long ds_mergerel = spark
-		long ds_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel").count();
+			.read()
 			.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
 			.count();
 		long orp_mergerel = spark
 			.read()
@ -241,7 +351,7 @@ public class SparkDedupTest implements Serializable {
 	}
 	@Test
-	@Order(3)
+	@Order(4)
 	public void createDedupRecordTest() throws Exception {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -288,7 +398,7 @@ public class SparkDedupTest implements Serializable {
 	}
 	@Test
-	@Order(4)
+	@Order(5)
 	public void updateEntityTest() throws Exception {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -404,7 +514,7 @@ public class SparkDedupTest implements Serializable {
 	}
 	@Test
-	@Order(5)
+	@Order(6)
 	public void propagateRelationTest() throws Exception {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -423,7 +533,7 @@ public class SparkDedupTest implements Serializable {
 		long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
-		assertEquals(4975, relations);
+		assertEquals(4866, relations);
 		// check deletedbyinference
 		final Dataset<Relation> mergeRels = spark
@ -454,7 +564,7 @@ public class SparkDedupTest implements Serializable {
 	}
 	@Test
-	@Order(6)
+	@Order(7)
 	public void testRelations() throws Exception {
 		testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10);
 		testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2);
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java
@ -0,0 +1,177 @@
 package eu.dnetlib.dhp.oa.dedup;
 import static java.nio.file.Files.createTempDirectory;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.mockito.Mockito.lenient;
 import java.io.File;
 import java.io.IOException;
 import java.io.Serializable;
 import java.net.URISyntaxException;
 import java.nio.file.Paths;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
 * Runs {@link SparkBlockStats} against the dedup test graph and verifies how many block-statistics records
 * are written for each entity type. The lookup service is mocked so that the orchestrator profile and the
 * per-entity dedup configurations are served from classpath resources.
 */
@ExtendWith(MockitoExtension.class)
 public class SparkStatsTest implements Serializable {
 	@Mock(serializable = true)
 	ISLookUpService isLookUpService;
 	private static SparkSession spark;
 	private static JavaSparkContext jsc;
 	private static String testGraphBasePath;
 	private static String testOutputBasePath;
 	private static final String testActionSetId = "test-orchestrator";
 	/**
 	 * Resolves the input graph path, reserves a unique output path and starts the local Spark session
 	 * shared by the tests of this class.
 	 */
 	@BeforeAll
 	public static void cleanUp() throws IOException, URISyntaxException {
 		testGraphBasePath = Paths
 			.get(SparkStatsTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI())
 			.toFile()
 			.getAbsolutePath();
 		testOutputBasePath = createTempDirectory(SparkStatsTest.class.getSimpleName() + "-")
 			.toAbsolutePath()
 			.toString();
 		// only the unique path is needed: the directory itself is removed right after being created
 		FileUtils.deleteDirectory(new File(testOutputBasePath));
 		final SparkConf conf = new SparkConf();
 		conf.set("spark.sql.shuffle.partitions", "200");
 		spark = SparkSession
 			.builder()
 			.appName(SparkStatsTest.class.getSimpleName())
 			.master("local[*]")
 			.config(conf)
 			.getOrCreate();
 		jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 	}
 	/**
 	 * Stubs the lookup service: queries containing the action set id return the mock orchestrator profile,
 	 * queries containing an entity type name return the matching dedup configuration.
 	 */
 	@BeforeEach
 	public void setUp() throws IOException, ISLookUpException {
 		// stubbing order is kept as-is: when several stubs match the same query, Mockito applies the latest one
 		mockProfile(testActionSetId, "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml");
 		mockProfile("organization", "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json");
 		mockProfile("publication", "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json");
 		mockProfile("software", "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json");
 		mockProfile("dataset", "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json");
 		mockProfile("otherresearchproduct", "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json");
 	}
 	/**
 	 * Executes the block statistics job and checks the number of records found in each
 	 * {@code <entity>_blockstats} output.
 	 */
 	@Test
 	public void createBlockStatsTest() throws Exception {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			loadResource("/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"));
 		parser
 			.parseArgument(
 				new String[] {
 					"-i", testGraphBasePath,
 					"-asi", testActionSetId,
 					"-la", "lookupurl",
 					"-w", testOutputBasePath
 				});
 		new SparkBlockStats(parser, spark).run(isLookUpService);
 		assertEquals(121, countBlockStats("organization"));
 		assertEquals(110, countBlockStats("publication"));
 		assertEquals(21, countBlockStats("software"));
 		assertEquals(67, countBlockStats("dataset"));
 		assertEquals(55, countBlockStats("otherresearchproduct"));
 	}
 	/** Makes the mocked lookup service answer any query containing {@code queryFragment} with the given resource. */
 	private void mockProfile(final String queryFragment, final String resourcePath)
 		throws IOException, ISLookUpException {
 		lenient()
 			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(queryFragment)))
 			.thenReturn(loadResource(resourcePath));
 	}
 	/** Reads a classpath resource into a String and closes the underlying stream. */
 	private static String loadResource(final String resourcePath) throws IOException {
 		try (java.io.InputStream is = SparkStatsTest.class.getResourceAsStream(resourcePath)) {
 			return IOUtils.toString(is);
 		}
 	}
 	/** Counts the lines of the block statistics written by the job for the given entity type. */
 	private static long countBlockStats(final String entityType) {
 		return spark
 			.read()
 			.textFile(testOutputBasePath + "/" + testActionSetId + "/" + entityType + "_blockstats")
 			.count();
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json
@ -6,10 +6,10 @@
    "subEntityType" : "resulttype",
    "subEntityValue" : "dataset",
    "orderField" : "title",
-    "queueMaxSize" : "2000",
+    "queueMaxSize" : "100",
    "groupMaxSize" : "100",
    "maxChildren" : "100",
-    "slidingWindowSize" : "200",
+    "slidingWindowSize" : "100",
    "rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
    "includeChildren" : "true",
    "idPath" : "$.id",
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json
@ -6,10 +6,10 @@
    "subEntityType" : "resulttype",
    "subEntityValue" : "otherresearchproduct",
    "orderField" : "title",
-    "queueMaxSize" : "2000",
+    "queueMaxSize" : "100",
    "groupMaxSize" : "100",
    "maxChildren" : "100",
-    "slidingWindowSize" : "200",
+    "slidingWindowSize" : "100",
    "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
    "includeChildren" : "true",
    "idPath" : "$.id",
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
@ -6,10 +6,10 @@
    "subEntityType": "resulttype",
    "subEntityValue": "publication",
    "orderField": "title",
-    "queueMaxSize": "2000",
+    "queueMaxSize": "100",
    "groupMaxSize": "100",
    "maxChildren": "100",
-    "slidingWindowSize": "200",
+    "slidingWindowSize": "100",
    "rootBuilder": [
      "result",
      "resultProject_outcome_isProducedBy",
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json
@ -6,10 +6,10 @@
      "subEntityType" : "resulttype",
      "subEntityValue" : "software",
      "orderField" : "title",
-      "queueMaxSize" : "2000",
+      "queueMaxSize" : "100",
      "groupMaxSize" : "100",
      "maxChildren" : "100",
-      "slidingWindowSize" : "200",
+      "slidingWindowSize" : "100",
      "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
      "includeChildren" : "true",
      "idPath" : "$.id",
@ -19,7 +19,7 @@
      "clustering" : [
        { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
        { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
-        { "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } }
+        { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
      ],
      "decisionTree": {
        "start": {
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/software_merge.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/software_merge.json
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@ -9,6 +9,37 @@
    <artifactId>dhp-graph-mapper</artifactId>
    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.0.1</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>initialize</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
@ -61,6 +92,13 @@
            <groupId>org.postgresql</groupId>
            <artifactId>postgresql</artifactId>
        </dependency>
        <dependency>
            <groupId>org.json4s</groupId>
            <artifactId>json4s-jackson_2.11</artifactId>
            <version>3.5.3</version>
        </dependency>
    </dependencies>
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
 import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -97,7 +98,7 @@ public class CleanGraphSparkJob {
 			.json(outputPath);
 	}
-	private static <T extends Oaf> T fixDefaults(T value) {
+	protected static <T extends Oaf> T fixDefaults(T value) {
 		if (value instanceof Datasource) {
 			// nothing to clean here
 		} else if (value instanceof Project) {
@ -134,11 +135,6 @@ public class CleanGraphSparkJob {
 					.setResourcetype(
 						qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
 			}
 			if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
 				r
 					.setBestaccessright(
 						qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
 			}
 			if (Objects.nonNull(r.getInstance())) {
 				for (Instance i : r.getInstance()) {
 					if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
@ -152,6 +148,16 @@ public class CleanGraphSparkJob {
 					}
 				}
 			}
 			if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
 				Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
 				if (Objects.isNull(bestaccessrights)) {
 					r
 						.setBestaccessright(
 							qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
 				} else {
 					r.setBestaccessright(bestaccessrights);
 				}
 			}
 			if (Objects.nonNull(r.getAuthor())) {
 				boolean nullRank = r
 					.getAuthor()
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
@ -9,6 +9,7 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
@ -42,6 +43,12 @@ public class GraphHiveTableImporterJob {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		int numPartitions = Optional
 			.ofNullable(parser.get("numPartitions"))
 			.map(Integer::valueOf)
 			.orElse(-1);
 		log.info("numPartitions: {}", numPartitions);
 		String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);
@ -60,16 +67,21 @@ public class GraphHiveTableImporterJob {
 		conf.set("hive.metastore.uris", hiveMetastoreUris);
 		runWithSparkHiveSession(
-			conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz));
+			conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz, numPartitions));
 	}
 	// protected for testing
 	private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
-		Class<T> clazz) {
+		Class<T> clazz, int numPartitions) {
-		spark
+		Dataset<String> dataset = spark.read().textFile(inputPath);
-			.read()
+
-			.textFile(inputPath)
+		if (numPartitions > 0) {
 			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
 			dataset = dataset.repartition(numPartitions);
 		}
 		dataset
 			.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java
@ -0,0 +1,162 @@
 package eu.dnetlib.dhp.oa.graph.merge;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Objects;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import scala.Tuple2;
 /**
 * Combines the content from two aggregator graph tables of the same type. When an entity (or relationship) with the
 * same id is available on both sides, the record to keep is chosen according to the "priority" parameter: by default
 * the one from the BETA aggregator is preferred rather than the one from PROD. The identity of each record is computed
 * with eu.dnetlib.dhp.schema.common.ModelSupport#idFn()
 */
 public class MergeGraphSparkJob {
 	private static final Logger log = LoggerFactory.getLogger(MergeGraphSparkJob.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static final String PRIORITY_DEFAULT = "BETA"; // BETA | PROD
 	/**
 	 * Job entry point: parses the arguments described by merge_graphs_parameters.json, removes the output
 	 * directory and writes the merged table.
 	 */
 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				MergeGraphSparkJob.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		String priority = Optional
 			.ofNullable(parser.get("priority"))
 			.orElse(PRIORITY_DEFAULT);
 		log.info("priority: {}", priority);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		String betaInputPath = parser.get("betaInputPath");
 		log.info("betaInputPath: {}", betaInputPath);
 		String prodInputPath = parser.get("prodInputPath");
 		log.info("prodInputPath: {}", prodInputPath);
 		String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		String graphTableClassName = parser.get("graphTableClassName");
 		log.info("graphTableClassName: {}", graphTableClassName);
 		// bounded by Oaf (not OafEntity) so that the declared type also covers relationship tables
 		@SuppressWarnings("unchecked")
 		Class<? extends Oaf> graphTableClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
 		SparkConf conf = new SparkConf();
 		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
 		conf.registerKryoClasses(ModelSupport.getOafModelClasses());
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				removeOutputDir(spark, outputPath);
 				mergeGraphTable(
 					spark, priority, betaInputPath, prodInputPath, graphTableClazz, graphTableClazz, outputPath);
 			});
 	}
 	/**
 	 * Full-outer-joins the PROD and BETA tables on the record id, keeps one record per id according to the
 	 * priority and stores the result as gzip-compressed JSON, overwriting the output path.
 	 */
 	private static <P extends Oaf, B extends Oaf> void mergeGraphTable(
 		SparkSession spark,
 		String priority,
 		String betaInputPath,
 		String prodInputPath,
 		Class<P> p_clazz,
 		Class<B> b_clazz,
 		String outputPath) {
 		Dataset<Tuple2<String, B>> beta = readTableFromPath(spark, betaInputPath, b_clazz);
 		Dataset<Tuple2<String, P>> prod = readTableFromPath(spark, prodInputPath, p_clazz);
 		prod
 			.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer")
 			.map((MapFunction<Tuple2<Tuple2<String, P>, Tuple2<String, B>>, P>) value -> {
 				// either side of a full outer join row may be missing
 				Optional<P> p = Optional.ofNullable(value._1()).map(Tuple2::_2);
 				Optional<B> b = Optional.ofNullable(value._2()).map(Tuple2::_2);
 				switch (priority) {
 					case "PROD":
 						return mergeWithPriorityToPROD(p, b);
 					case "BETA":
 					default:
 						// any value other than PROD behaves like BETA
 						return mergeWithPriorityToBETA(p, b);
 				}
 			}, Encoders.bean(p_clazz))
 			.filter((FilterFunction<P>) Objects::nonNull)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(outputPath);
 	}
 	/** Returns the PROD record when available, otherwise the BETA one; null when both are missing. */
 	@SuppressWarnings("unchecked")
 	private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToPROD(Optional<P> p, Optional<B> b) {
 		if (p.isPresent()) {
 			return p.get();
 		}
 		return b.isPresent() ? (P) b.get() : null;
 	}
 	/** Returns the BETA record when available, otherwise the PROD one; null when both are missing. */
 	@SuppressWarnings("unchecked")
 	private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToBETA(Optional<P> p, Optional<B> b) {
 		if (b.isPresent()) {
 			return (P) b.get();
 		}
 		return p.orElse(null);
 	}
 	/**
 	 * Reads a graph table stored as JSON lines and pairs every record with the id computed by
 	 * ModelSupport.idFn().
 	 */
 	private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableFromPath(
 		SparkSession spark, String inputEntityPath, Class<T> clazz) {
 		log.info("Reading Graph table from: {}", inputEntityPath);
 		return spark
 			.read()
 			.textFile(inputEntityPath)
 			.map(
 				(MapFunction<String, Tuple2<String, T>>) value -> {
 					final T t = OBJECT_MAPPER.readValue(value, clazz);
 					final String id = ModelSupport.idFn().apply(t);
 					return new Tuple2<>(id, t);
 				},
 				Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
 	}
 	/** Deletes the given path using the Hadoop configuration of the Spark context. */
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@ -1,36 +1,10 @@
 package eu.dnetlib.dhp.oa.graph.raw;
-import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
-import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.keyValue;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
-import java.util.ArrayList;
+import java.util.*;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import org.apache.commons.lang3.StringUtils;
 import org.dom4j.Document;
@ -40,24 +14,8 @@ import org.dom4j.Node;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.LicenseComparator;
-import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Context;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
 import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.GeoLocation;
 import eu.dnetlib.dhp.schema.oaf.Instance;
 import eu.dnetlib.dhp.schema.oaf.Journal;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.Software;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 public abstract class AbstractMdRecordToOafMapper {
@ -99,7 +57,6 @@ public abstract class AbstractMdRecordToOafMapper {
 			final Document doc = DocumentHelper
 				.parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));
 			final String type = doc.valueOf("//dr:CobjCategory/@type");
 			final KeyValue collectedFrom = getProvenanceDatasource(
 				doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
@ -118,12 +75,39 @@ public abstract class AbstractMdRecordToOafMapper {
 			final DataInfo info = prepareDataInfo(doc, invisible);
 			final long lastUpdateTimestamp = new Date().getTime();
-			return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+			final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
 			final String type = getResultType(doc, instances);
 			return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
 		} catch (final Exception e) {
 			throw new RuntimeException(e);
 		}
 	}
 	/**
 	 * Resolves the result type of a record. The value of //dr:CobjCategory/@type is returned when it is not
 	 * blank; otherwise, provided that the result typologies vocabulary exists, the type is derived from the
 	 * instance type of the first instance, falling back to "0000" when nothing can be resolved.
 	 *
 	 * @param doc the parsed metadata record
 	 * @param instances the instances already extracted from the record
 	 * @return the explicit type, or the class id resolved through the vocabulary, or "0000"
 	 */
 	protected String getResultType(final Document doc, final List<Instance> instances) {
 		final String type = doc.valueOf("//dr:CobjCategory/@type");
 		// short-circuit: the vocabulary is looked up only when the record has no explicit type
 		if (StringUtils.isBlank(type) && vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
 			// Optional.map turns a missing instance type / class id into the "0000" default instead of an NPE
 			final String instanceType = Optional
 				.ofNullable(instances)
 				.flatMap(l -> l.stream().filter(Objects::nonNull).findFirst())
 				.map(Instance::getInstancetype)
 				.map(Qualifier::getClassid)
 				.map(s -> UNKNOWN.equalsIgnoreCase(s) ? "0000" : s)
 				.orElse("0000"); // Unknown
 			return Optional
 				.ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
 				.map(Qualifier::getClassid)
 				.orElse("0000");
 			/*
 			 * Stricter alternative, currently disabled: .orElseThrow( () -> new IllegalArgumentException(
 			 * String.format("'%s' not mapped in %s", instanceType, DNET_RESULT_TYPOLOGIES)));
 			 */
 		}
 		return type;
 	}
 	private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
 		final String dsId = doc.valueOf(xpathId);
 		final String dsName = doc.valueOf(xpathName);
@ -138,8 +122,8 @@ public abstract class AbstractMdRecordToOafMapper {
 	protected List<Oaf> createOafs(
 		final Document doc,
 		final String type,
 		final List<Instance> instances,
 		final KeyValue collectedFrom,
 		final KeyValue hostedBy,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
@ -148,14 +132,14 @@ public abstract class AbstractMdRecordToOafMapper {
 		switch (type.toLowerCase()) {
 			case "publication":
 				final Publication p = new Publication();
-				populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+				populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
 				p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
 				p.setJournal(prepareJournal(doc, info));
 				oafs.add(p);
 				break;
 			case "dataset":
 				final Dataset d = new Dataset();
-				populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+				populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
 				d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
 				d.setStoragedate(prepareDatasetStorageDate(doc, info));
 				d.setDevice(prepareDatasetDevice(doc, info));
@ -168,7 +152,7 @@ public abstract class AbstractMdRecordToOafMapper {
 				break;
 			case "software":
 				final Software s = new Software();
-				populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+				populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
 				s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
 				s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
 				s.setLicense(prepareSoftwareLicenses(doc, info));
@ -180,7 +164,7 @@ public abstract class AbstractMdRecordToOafMapper {
 			case "otherresearchproducts":
 			default:
 				final OtherResearchProduct o = new OtherResearchProduct();
-				populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+				populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
 				o.setResulttype(ORP_DEFAULT_RESULTTYPE);
 				o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
 				o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
@ -259,14 +243,16 @@ public abstract class AbstractMdRecordToOafMapper {
 	private void populateResultFields(
 		final Result r,
 		final Document doc,
 		final List<Instance> instances,
 		final KeyValue collectedFrom,
 		final KeyValue hostedBy,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
 		r.setDataInfo(info);
 		r.setLastupdatetimestamp(lastUpdateTimestamp);
 		r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
-		r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
+
 		r.setOriginalId(Arrays.asList(findOriginalId(doc)));
 		r.setCollectedfrom(Arrays.asList(collectedFrom));
 		r.setPid(prepareResultPids(doc, info));
 		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
@ -291,7 +277,7 @@ public abstract class AbstractMdRecordToOafMapper {
 		r.setCoverage(prepareCoverages(doc, info));
 		r.setContext(prepareContexts(doc, info));
 		r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
-		final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
+
 		r.setInstance(instances);
 		r.setBestaccessright(getBestAccessRights(instances));
 	}
@ -378,6 +364,10 @@ public abstract class AbstractMdRecordToOafMapper {
 	protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
 	/**
 	 * Public entry point that delegates to {@link #getBestAccessRights(List)}.
 	 *
 	 * @param instanceList the instances to evaluate
 	 * @return whatever getBestAccessRights returns for the given list
 	 */
 	public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
 		return getBestAccessRights(instanceList);
 	}
 	protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
 		if (instanceList != null) {
 			final Optional<Qualifier> min = instanceList
@ -425,6 +415,18 @@ public abstract class AbstractMdRecordToOafMapper {
 		return null;
 	}
 	/**
 	 * Looks up the original identifier of the record: the identifier found under provenance/originDescription
 	 * is used when it is not blank, otherwise the identifier found under the header element.
 	 */
 	private String findOriginalId(final Document doc) {
 		final Node originDescription = doc
 			.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
 		final String provenanceId = originDescription == null
 			? null
 			: originDescription.valueOf("./*[local-name()='identifier']");
 		return StringUtils.isNotBlank(provenanceId)
 			? provenanceId
 			: doc.valueOf("//*[local-name()='header']/*[local-name()='identifier']");
 	}
 	protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
 		return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
 	}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
@ -4,7 +4,11 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.function.Function;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
@ -57,6 +61,7 @@ public class OafMapperUtils {
 			.stream(values)
 			.map(v -> field(v, info))
 			.filter(Objects::nonNull)
 			.filter(distinctByKey(f -> f.getValue()))
 			.collect(Collectors.toList());
 	}
@ -65,6 +70,7 @@ public class OafMapperUtils {
 			.stream()
 			.map(v -> field(v, info))
 			.filter(Objects::nonNull)
 			.filter(distinctByKey(f -> f.getValue()))
 			.collect(Collectors.toList());
 	}
@ -237,4 +243,10 @@ public class OafMapperUtils {
 	/** Returns {@code o.toString()}, or the empty string when {@code o} is null. */
 	public static String asString(final Object o) {
 		return Objects.toString(o, "");
 	}
 	/**
 	 * Builds a stateful predicate that accepts an element only the first time its key is seen, so that
 	 * {@code stream.filter(distinctByKey(...))} keeps one element per key.
 	 *
 	 * @param keyExtractor computes the key used to compare the elements; it may return null
 	 * @return a predicate returning true on the first occurrence of each key, false afterwards
 	 */
 	public static <T> Predicate<T> distinctByKey(
 		final Function<? super T, ?> keyExtractor) {
 		final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
 		// ConcurrentHashMap rejects null keys: a private sentinel stands in for them instead of failing
 		final Object nullKey = new Object();
 		return t -> {
 			final Object key = keyExtractor.apply(t);
 			return seen.putIfAbsent(key == null ? nullKey : key, Boolean.TRUE) == null;
 		};
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
@ -0,0 +1,89 @@
 package eu.dnetlib.dhp.sx.ebi
 import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
 import org.apache.spark.sql.{Encoder, Encoders}
 import org.apache.spark.sql.expressions.Aggregator
 /** Typed Spark aggregators used to collapse records that share the same grouping key. */
 object EBIAggregator {
  /**
    * Aggregator merging every dataset of a group into a single one through `mergeFrom`.
    * The identifier of the result is taken from the first record/buffer that carries one.
    *
    * @return an aggregator over `(key, dataset)` pairs producing the merged dataset
    */
  def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{
    override def zero: OafDataset = new OafDataset()
    override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
      b.mergeFrom(a._2)
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }
    override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
      wx.mergeFrom(wy)
      // Option(...) protects against a partial buffer whose id was never set:
      // calling nonEmpty on a null String would throw a NullPointerException.
      if (wx.getId == null && Option(wy.getId).exists(_.nonEmpty))
        wx.setId(wy.getId)
      wx
    }
    override def finish(reduction: OafDataset): OafDataset = reduction
    override def bufferEncoder: Encoder[OafDataset] =
      Encoders.kryo(classOf[OafDataset])
    override def outputEncoder: Encoder[OafDataset] =
      Encoders.kryo(classOf[OafDataset])
  }
  /**
    * Aggregator merging every publication of a group into a single one through `mergeFrom`.
    * The identifier of the result is taken from the first record/buffer that carries one.
    *
    * @return an aggregator over `(key, publication)` pairs producing the merged publication
    */
  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
    override def zero: Publication = new Publication()
    override def reduce(b: Publication, a: (String, Publication)): Publication = {
      b.mergeFrom(a._2)
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }
    override def merge(wx: Publication, wy: Publication): Publication = {
      wx.mergeFrom(wy)
      // Option(...) protects against a partial buffer whose id was never set:
      // calling nonEmpty on a null String would throw a NullPointerException.
      if (wx.getId == null && Option(wy.getId).exists(_.nonEmpty))
        wx.setId(wy.getId)
      wx
    }
    override def finish(reduction: Publication): Publication = reduction
    override def bufferEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])
    override def outputEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])
  }
  /**
    * Aggregator de-duplicating relations: no merge is performed, the most recently seen
    * relation of a group simply replaces the buffer.
    *
    * @return an aggregator over `(key, relation)` pairs producing one relation per key
    */
  def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{
    override def zero: Relation = new Relation()
    // The buffer is ignored on purpose: the incoming relation always wins.
    override def reduce(b: Relation, a: (String, Relation)): Relation = {
      a._2
    }
    // Prefers the second buffer and falls back to the first one only when the second is null.
    override def merge(a: Relation, b: Relation): Relation = {
      if(b!= null) b else a
    }
    override def finish(reduction: Relation): Relation = reduction
    override def bufferEncoder: Encoder[Relation] =
      Encoders.kryo(classOf[Relation])
    override def outputEncoder: Encoder[Relation] =
      Encoders.kryo(classOf[Relation])
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala
@ -0,0 +1,138 @@
 package eu.dnetlib.dhp.sx.ebi
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.{Instance, KeyValue, Oaf}
 import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
 import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIRelation, OafUtils, ProvenaceInfo}
 import eu.dnetlib.dhp.utils.DHPUtils
 import eu.dnetlib.scholexplorer.relation.RelationMapper
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
 import scala.collection.JavaConverters._
 /**
   * Spark job converting the Europe PMC link updates (pairs of pmid and JSON payload) into
   * DLI relations and datasets, and storing them split by type under the working path.
   */
 object SparkAddLinkUpdates {
  val relationMapper = RelationMapper.load
  /** Flat view of one link extracted from the JSON payload. */
  case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {}
  /** Creates the `collectedfrom` entry identifying Europe PMC as provenance. */
  def generatePubmedDLICollectedFrom(): KeyValue = {
    OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
  }
  /**
    * Converts the links of one publication into OAF objects.
    *
    * @param input pair made of the publication pmid and the JSON document describing its links
    * @return for every link found: the relation, its inverse and the target dataset
    */
  def ebiLinksToOaf(input:(String, String)):List[Oaf] = {
    val pmid :String = input._1
    val input_json :String = input._2
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input_json)
    // Only links providing every one of the fields below are extracted.
    val targets:List[EBILinks] = for {
      JObject(link) <- json \\ "Category" \\ "Link"
      JField("PublicationDate", JString(pubdate)) <- link
      JField("RelationshipType", JObject(relationshipType)) <- link
      JField("Name", JString(relname)) <- relationshipType
      JField("Target", JObject(target)) <- link
      JField("Identifier", JObject(identifier)) <- target
      JField("ID", JString(tpid)) <- identifier
      JField("IDScheme", JString(tpidtype)) <- identifier
      JField("IDURL", JString(turl)) <- identifier
      JField("Title", JString(title)) <- target
      JField("Publisher", JObject(pub)) <- target
      JField("Name", JString(publisher)) <- pub
    } yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher)
    val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"
    targets.flatMap(l => {
      // Normalised once: the same values feed both the target identifier and the pid.
      val targetPid = l.tpid.toLowerCase.trim
      val targetPidType = l.tpidType.toLowerCase.trim
      val targetDnetId =  s"50|${DHPUtils.md5(s"$targetPid::$targetPidType")}"
      // NOTE(review): the lookup result is dereferenced without a null check; presumably every
      // relationship name occurring in the input is known to the mapper - confirm.
      val relInfo = relationMapper.get(l.relation.toLowerCase)
      val relationSemantic = relInfo.getOriginal
      val inverseRelationSemantic = relInfo.getInverse
      val relation = new DLIRelation
      relation.setSource(dnetPublicationId)
      relation.setTarget(targetDnetId)
      relation.setRelClass("datacite")
      relation.setRelType(relationSemantic)
      relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
      val inverseRelation = new DLIRelation
      inverseRelation.setSource(targetDnetId)
      inverseRelation.setTarget(dnetPublicationId)
      inverseRelation.setRelClass("datacite")
      inverseRelation.setRelType(inverseRelationSemantic)
      inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
      val d = new DLIDataset
      d.setId(targetDnetId)
      d.setDataInfo(OafUtils.generateDataInfo())
      d.setPid(List(OafUtils.createSP(targetPid, targetPidType, "dnet:pid_types")).asJava)
      d.setCompletionStatus("complete")
      val pi = new ProvenaceInfo
      pi.setId("dli_________::europe_pmc__")
      pi.setName( "Europe PMC")
      pi.setCompletionStatus("complete")
      pi.setCollectionMode("collected")
      d.setDlicollectedfrom(List(pi).asJava)
      d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
      d.setPublisher(OafUtils.asField(l.publisher))
      d.setTitle(List(OafUtils.createSP(l.title, "main title", "dnet:dataCite_title")).asJava)
      d.setDateofacceptance(OafUtils.asField(l.pubdate))
      val i = new Instance
      i.setCollectedfrom(generatePubmedDLICollectedFrom())
      i.setDateofacceptance(d.getDateofacceptance)
      i.setUrl(List(l.turl).asJava)
      i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
      d.setInstance(List(i).asJava)
      List(relation, inverseRelation, d)
    })
  }
  /**
    * Job entry point.
    *
    * Reads `baseline_links_updates` from the working path, writes the converted objects to
    * `baseline_links_updates_oaf` and then splits them into `baseline_links_updates_relation`
    * and `baseline_links_updates_dataset`.
    *
    * @param args command line arguments described by `ebi_to_df_params.json`
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    // The application is named after this job rather than after SparkCreateEBIDataFrame.
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    val workingPath = parser.get("workingPath")
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
    implicit val relEncoder: Encoder[DLIRelation] = Encoders.kryo(classOf[DLIRelation])
    implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
    val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))
    ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")
    // The mixed output is read back and split by concrete type.
    val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]
    oDataset.filter(p =>p.isInstanceOf[DLIRelation]).map(p => p.asInstanceOf[DLIRelation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
    oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala
@ -0,0 +1,49 @@
 package eu.dnetlib.dhp.sx.ebi
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
 import scala.io.Source
 import scala.xml.pull.XMLEventReader
 /**
   * Spark job parsing the files found under `baseline` in the working path into
   * [[PMArticle]] records, stored under `baseline_dataset`.
   */
 object SparkCreateBaselineDataFrame {
  /**
    * Job entry point.
    *
    * @param args command line arguments described by `ebi_to_df_params.json`
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    // The application is named after this job rather than after SparkCreateEBIDataFrame.
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    val sc = spark.sparkContext
    val workingPath = parser.get("workingPath")
    implicit  val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
    implicit  val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
    implicit  val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
    // Each element is a (file path, whole file content) pair.
    val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
    val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
      // The content is already a String: reading it directly avoids an encode/decode round trip
      // through the platform default charset, which can corrupt characters it cannot represent.
      val xml = new XMLEventReader(Source.fromString(i._2))
      new PMParser(xml)
    } ))
    ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala
@ -0,0 +1,87 @@
 package eu.dnetlib.dhp.sx.ebi
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
 import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
 import eu.dnetlib.scholexplorer.relation.RelationMapper
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
 import org.slf4j.{Logger, LoggerFactory}
 import scala.collection.JavaConverters._
 /**
   * Spark job parsing the publication and dataset XML records into OAF objects and
   * de-duplicating publications, datasets and relations under the working path.
   */
 object SparkCreateEBIDataFrame {
  /**
    * Reads a text file whose lines are JSON encoded strings and decodes each line.
    * A single ObjectMapper is created per partition instead of one per record.
    */
  private def readJsonStrings(sc: org.apache.spark.SparkContext, path: String): RDD[String] =
    sc.textFile(path).mapPartitions { lines =>
      val mapper = new ObjectMapper()
      lines.map(line => mapper.readValue(line, classOf[String]))
    }
  /**
    * Job entry point.
    *
    * Writes every parsed object to `oaf`, then reads it back per type and stores the grouped
    * results under `publication`, `dataset` and `relation`.
    *
    * @param args command line arguments described by `ebi_to_df_params.json`
    */
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    val sc = spark.sparkContext
    val workingPath = parser.get("workingPath")
    val relationMapper = RelationMapper.load
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
    implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
    implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
    logger.info("Extract Publication and relation from publication_xml")
    val oafPubsRDD:RDD[Oaf] = readJsonStrings(sc, s"$workingPath/publication_xml").flatMap(s => {
      val d = new PublicationScholexplorerParser
      d.parseObject(s, relationMapper).asScala.iterator})
    spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")
    logger.info("Extract Publication and relation from dataset_xml")
    val oafDatsRDD:RDD[Oaf] = readJsonStrings(sc, s"$workingPath/dataset_xml").flatMap(s => {
      val d = new DatasetScholexplorerParser
      d.parseObject(s, relationMapper).asScala.iterator})
    // Appended to the output of the publication step above.
    spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf")
    val dataset: Dataset[OafDataset] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[OafDataset]).map(d => d.asInstanceOf[OafDataset])
    val publication: Dataset[Publication] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Publication]).map(d => d.asInstanceOf[Publication])
    val relations: Dataset[Relation] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation])
    // Publications and datasets are grouped by identifier and merged.
    publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getPublicationAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/publication")
    dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getDatasetAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
    // Relations are grouped by the (source, relType, target) triple.
    relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getRelationAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java
@ -0,0 +1,64 @@
 package eu.dnetlib.dhp.sx.ebi.model;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 /**
  * Serializable bean describing an article: identifier, date, journal, title, description
  * and the list of its authors.
  */
 public class PMArticle implements Serializable {
 	private String pmid;
 	private String title;
 	private String description;
 	private String date;
 	private PMJournal journal;
 	/** Never null unless explicitly replaced through {@link #setAuthors(List)}. */
 	private List<PMAuthor> authors = new ArrayList<>();
 	/** @return the article identifier */
 	public String getPmid() {
 		return this.pmid;
 	}
 	/** @param pmid the article identifier */
 	public void setPmid(final String pmid) {
 		this.pmid = pmid;
 	}
 	/** @return the article date */
 	public String getDate() {
 		return this.date;
 	}
 	/** @param date the article date */
 	public void setDate(final String date) {
 		this.date = date;
 	}
 	/** @return the journal the article belongs to */
 	public PMJournal getJournal() {
 		return this.journal;
 	}
 	/** @param journal the journal the article belongs to */
 	public void setJournal(final PMJournal journal) {
 		this.journal = journal;
 	}
 	/** @return the article title */
 	public String getTitle() {
 		return this.title;
 	}
 	/** @param title the article title */
 	public void setTitle(final String title) {
 		this.title = title;
 	}
 	/** @return the article description */
 	public String getDescription() {
 		return this.description;
 	}
 	/** @param description the article description */
 	public void setDescription(final String description) {
 		this.description = description;
 	}
 	/** @return the authors of the article */
 	public List<PMAuthor> getAuthors() {
 		return this.authors;
 	}
 	/** @param authors the authors of the article */
 	public void setAuthors(final List<PMAuthor> authors) {
 		this.authors = authors;
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java
@ -0,0 +1,31 @@
 package eu.dnetlib.dhp.sx.ebi.model;
 import java.io.Serializable;
 /** Serializable bean holding the fore name and last name of an author. */
 public class PMAuthor implements Serializable {
 	private String lastName;
 	private String foreName;
 	/** @return the last name, may be {@code null} */
 	public String getLastName() {
 		return lastName;
 	}
 	/** @param lastName the last name */
 	public void setLastName(String lastName) {
 		this.lastName = lastName;
 	}
 	/** @return the fore name, may be {@code null} */
 	public String getForeName() {
 		return foreName;
 	}
 	/** @param foreName the fore name */
 	public void setForeName(String foreName) {
 		this.foreName = foreName;
 	}
 	/**
 	 * Renders the author as {@code "<foreName>, <lastName>"}. A missing part is rendered as the
 	 * empty string instead of the literal text "null".
 	 *
 	 * @return the formatted full name, never {@code null}
 	 */
 	public String getFullName() {
 		// NOTE(review): the fore name comes first although a comma separated form usually
 		// starts with the last name; the order is kept unchanged - confirm it is intended.
 		return String
 			.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java
@ -0,0 +1,53 @@
 package eu.dnetlib.dhp.sx.ebi.model;
 import java.io.Serializable;
 /** Serializable bean describing a journal: title, ISSN, volume, issue and date. */
 public class PMJournal implements Serializable {
 	private String title;
 	private String issn;
 	private String volume;
 	private String issue;
 	private String date;
 	/** @return the journal ISSN */
 	public String getIssn() {
 		return this.issn;
 	}
 	/** @param issn the journal ISSN */
 	public void setIssn(final String issn) {
 		this.issn = issn;
 	}
 	/** @return the journal volume */
 	public String getVolume() {
 		return this.volume;
 	}
 	/** @param volume the journal volume */
 	public void setVolume(final String volume) {
 		this.volume = volume;
 	}
 	/** @return the journal issue */
 	public String getIssue() {
 		return this.issue;
 	}
 	/** @param issue the journal issue */
 	public void setIssue(final String issue) {
 		this.issue = issue;
 	}
 	/** @return the journal date */
 	public String getDate() {
 		return this.date;
 	}
 	/** @param date the journal date */
 	public void setDate(final String date) {
 		this.date = date;
 	}
 	/** @return the journal title */
 	public String getTitle() {
 		return this.title;
 	}
 	/** @param title the journal title */
 	public void setTitle(final String title) {
 		this.title = title;
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala
@ -0,0 +1,92 @@
 package eu.dnetlib.dhp.sx.ebi.model
 import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
 /**
   * Iterator producing one [[PMArticle]] for every `PubmedArticle` element found in the
   * given XML event stream. The next article is always parsed ahead of time.
   *
   * @param xml the pull parser events to consume
   */
 class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
  // Article that will be returned by the next call to next(); null once the stream is exhausted.
  var currentArticle:PMArticle = generateNextArticle()
  override def hasNext: Boolean = currentArticle!= null
  override def next(): PMArticle = {
    val tmp = currentArticle
    currentArticle = generateNextArticle()
    tmp
  }
  /**
    * Consumes events up to the end of the next `PubmedArticle` element.
    *
    * All the parsing state is local to the call, so elements met outside a `PubmedArticle`
    * can neither fail on a missing article nor alter the article already handed out.
    *
    * @return the parsed article, or null when the stream contains no further article
    */
  def generateNextArticle():PMArticle = {
    var article: PMArticle = null
    var currentAuthor: PMAuthor = null
    var currentJournal: PMJournal = null
    var currNode: String = null
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"
    while (xml.hasNext) {
      xml.next match {
        case EvElemStart(_, label, _, _) =>
          currNode = label
          label match {
            case "PubmedArticle" => article = new PMArticle
            case "Author" => currentAuthor = new PMAuthor
            case "Journal" => currentJournal = new PMJournal
            case _ =>
          }
        case EvElemEnd(_, label) =>
          label match {
            case "PubmedArticle" => if (article != null) return article
            case "Author" =>
              if (article != null && currentAuthor != null)
                article.getAuthors.add(currentAuthor)
              // Reset, otherwise a later LastName/ForeName belonging to another kind of
              // element would overwrite the names of the author just added.
              currentAuthor = null
            case "Journal" => if (article != null) article.setJournal(currentJournal)
            case "DateCompleted" => if (article != null) article.setDate(s"$currentYear-$currentMonth-$currentDay")
            case "PubDate" => if (currentJournal != null) currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
            case _ =>
          }
        case EvText(text) =>
          val value = text.trim
          if (currNode!= null && value.nonEmpty)
            currNode match {
              // The text of an element may be delivered in several chunks, hence the concatenation.
              case "ArticleTitle" if article != null =>
                if (article.getTitle==null)
                  article.setTitle(value)
                else
                  article.setTitle(article.getTitle + value)
              case "AbstractText" if article != null =>
                if (article.getDescription==null)
                  article.setDescription(value)
                else
                  article.setDescription(article.getDescription + value)
              // Only the first PMID identifies the article: the ones met later on
              // (e.g. inside CommentsCorrections) reference other records.
              case "PMID" if article != null && article.getPmid == null => article.setPmid(value)
              case "ISSN" if currentJournal != null => currentJournal.setIssn(value)
              case "Year" => currentYear = value
              case "Month" => currentMonth = value
              case "Day" => currentDay = value
              case "Volume" if currentJournal != null => currentJournal.setVolume(value)
              case "Issue" if currentJournal != null => currentJournal.setIssue(value)
              case "LastName" if currentAuthor != null => currentAuthor.setLastName(value)
              case "ForeName" if currentAuthor != null => currentAuthor.setForeName(value)
              case "Title" if currentJournal != null =>
                if (currentJournal.getTitle==null)
                  currentJournal.setTitle(value)
                else
                  currentJournal.setTitle(currentJournal.getTitle + value)
              case _ =>
            }
        case _ =>
      }
    }
    null
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java
@ -150,6 +150,17 @@ public abstract class AbstractScholexplorerParser {
 		return uk;
 	}
 	/**
 	 * Creates a qualifier from its four components.
 	 *
 	 * @param classId the class identifier
 	 * @param className the class name
 	 * @param schemeId the scheme identifier
 	 * @param schemeName the scheme name
 	 * @return a new qualifier carrying the given values
 	 */
 	protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
 		final String schemeName) {
 		final Qualifier q = new Qualifier();
 		q.setClassid(classId);
 		// The class name goes into its own property: it used to overwrite the class id.
 		q.setClassname(className);
 		q.setSchemeid(schemeId);
 		q.setSchemename(schemeName);
 		return q;
 	}
 	protected void generateRelations(
 		RelationMapper relationMapper,
 		Result parsedObject,
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java
@ -64,7 +64,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 				currentDate.setQualifier(dateQualifier);
 				parsedObject.setRelevantdate(Collections.singletonList(currentDate));
 			}
 			final String completionStatus = VtdUtilityParser
 				.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
 			final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
@ -149,6 +148,37 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 			inferPid(currentPid);
 			parsedObject.setPid(Collections.singletonList(currentPid));
 			String resolvedURL = null;
 			switch (currentPid.getQualifier().getClassname().toLowerCase()) {
 				case "uniprot":
 					resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue();
 					break;
 				case "ena":
 					if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
 						resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8);
 					break;
 				case "chembl":
 					resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue();
 					break;
 				case "ncbi-n":
 					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
 					break;
 				case "ncbi-p":
 					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
 					break;
 				case "genbank":
 					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
 					break;
 				case "pdb":
 					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
 					break;
 				case "url":
 					resolvedURL = currentPid.getValue();
 					break;
 			}
 			final String sourceId = generateId(
 				currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
 			parsedObject.setId(sourceId);
@ -251,6 +281,11 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 								t -> {
 									final StructuredProperty st = new StructuredProperty();
 									st.setValue(t);
 									st
 										.setQualifier(
 											generateQualifier(
 												"main title", "main title", "dnet:dataCite_title",
 												"dnet:dataCite_title"));
 									return st;
 								})
 							.collect(Collectors.toList()));
@ -282,6 +317,13 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 							.collect(Collectors.toList()));
 			}
 			if (StringUtils.isNotBlank(resolvedURL)) {
 				Instance i = new Instance();
 				i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
 				i.setUrl(Collections.singletonList(resolvedURL));
 				parsedObject.setInstance(Collections.singletonList(i));
 			}
 			result.add(parsedObject);
 			return result;
 		} catch (Throwable e) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java
@ -202,6 +202,11 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser
 								t -> {
 									final StructuredProperty st = new StructuredProperty();
 									st.setValue(t);
 									st
 										.setQualifier(
 											generateQualifier(
 												"main title", "main title", "dnet:dataCite_title",
 												"dnet:dataCite_title"));
 									return st;
 								})
 							.collect(Collectors.toList()));
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -282,6 +282,7 @@
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>100</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json
@ -5,6 +5,12 @@
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "np",
    "paramLongName": "numPartitions",
    "paramDescription": "number of dataset partitions",
    "paramRequired": false
  },
  {
    "paramName": "in",
    "paramLongName": "inputPath",
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml
@ -0,0 +1,18 @@
 <!-- Default property values for the Oozie application stored in this directory. -->
 <configuration>
    <!-- Job tracker the actions are submitted to. -->
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <!-- HDFS name node. -->
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <!-- Enables the Oozie system library path. -->
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <!-- Share library used by the spark actions. -->
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
@ -0,0 +1,293 @@
 <workflow-app name="merge graphs" xmlns="uri:oozie:workflow:0.5">
    <!--
        Formal parameters of the merge workflow.
        Only spark2ExtraListeners and spark2SqlQueryExecutionListeners declare a default value;
        every other property is declared with a name and description only.
    -->
    <parameters>
        <!--
            NOTE(review): "Ggraph" (double g) in the two input path names looks like a typo, but the
            merge actions of this workflow reference the names with this exact spelling, so they
            cannot be renamed here alone.
        -->
        <property>
            <name>betaInputGgraphPath</name>
            <description>the beta graph root path</description>
        </property>
        <property>
            <name>prodInputGgraphPath</name>
            <description>the production graph root path</description>
        </property>
        <property>
            <name>graphOutputPath</name>
            <description>the output merged graph root path</description>
        </property>
        <property>
            <name>priority</name>
            <description>decides from which infrastructure the content must win in case of ID clash</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <!--
            NOTE(review): declared without a default and not referenced by any action in this
            workflow definition; confirm whether it is still needed.
        -->
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>
 	<!-- Entry point: control goes straight to the fork that starts one merge action per graph table. -->
 	<start to="fork_merge_graph"/>
    <!-- Terminal failure node; every merge action routes here on error. -->
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Starts the eight merge actions (four result types, datasource, organization, project, relation). -->
    <fork name="fork_merge_graph">
        <path start="merge_publication"/>
        <path start="merge_dataset"/>
        <path start="merge_otherresearchproduct"/>
        <path start="merge_software"/>
        <path start="merge_datasource"/>
        <path start="merge_organization"/>
        <path start="merge_project"/>
        <path start="merge_relation"/>
    </fork>
    <!--
        Spark action "Merge publications": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the publication folders under the beta and production graph roots, output is the
        publication folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.Publication, and the workflow priority parameter is passed through.
        Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge publications</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/publication</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/publication</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!--
        Spark action "Merge datasets": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the dataset folders under the beta and production graph roots, output is the
        dataset folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.Dataset, and the workflow priority parameter is passed through.
        Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge datasets</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/dataset</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/dataset</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!--
        Spark action "Merge otherresearchproducts": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the otherresearchproduct folders under the beta and production graph roots, output
        is the otherresearchproduct folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.OtherResearchProduct, and the workflow priority parameter is passed
        through. Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_otherresearchproduct">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge otherresearchproducts</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/otherresearchproduct</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!--
        Spark action "Merge softwares": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the software folders under the beta and production graph roots, output is the
        software folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.Software, and the workflow priority parameter is passed through.
        Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge softwares</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/software</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/software</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!--
        Spark action "Merge datasources": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the datasource folders under the beta and production graph roots, output is the
        datasource folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.Datasource, and the workflow priority parameter is passed through.
        Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_datasource">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge datasources</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/datasource</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/datasource</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!--
        Spark action "Merge organizations": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the organization folders under the beta and production graph roots, output is the
        organization folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.Organization, and the workflow priority parameter is passed through.
        Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_organization">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge organizations</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/organization</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/organization</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!--
        Spark action "Merge projects": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the project folders under the beta and production graph roots, output is the
        project folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.Project, and the workflow priority parameter is passed through.
        Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_project">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge projects</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/project</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/project</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!--
        Spark action "Merge relations": submits MergeGraphSparkJob to YARN in cluster mode.
        Inputs are the relation folders under the beta and production graph roots, output is the
        relation folder under graphOutputPath, the graph table class is
        eu.dnetlib.dhp.schema.oaf.Relation, and the workflow priority parameter is passed through.
        Transitions to wait_merge on success and to Kill on error.
        NOTE(review): spark.sql.shuffle.partitions is fixed at 7680, the same value used by every
        merge action in this workflow; confirm it suits the size of this table.
    -->
    <action name="merge_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge relations</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/relation</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/relation</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <!-- Join targeted by the ok transition of every merge action; once passed, the workflow ends. -->
    <join name="wait_merge" to="End"/>
    <end name="End"/>
 </workflow-app>
--- a/Show More
+++ b/Show More