[DOIBoost author merger] merge with beta and added new test to verify the match for 0000-0002-4333-2748

commit 2aaa63dfa2
Author: Miriam Baglioni, 2023-05-23 15:44:03 +02:00
681 changed files with 38544 additions and 21694 deletions

2
.gitignore vendored

@ -25,4 +25,4 @@ spark-warehouse
/**/job-override.properties
/**/*.log
/**/.factorypath
/**/.scalafmt.conf

21
.scalafmt.conf Normal file

@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

dhp-build/dhp-build-assembly-resources/pom.xml

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

dhp-build/dhp-build-properties-maven-plugin/pom.xml

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>

dhp-build/dhp-code-style/pom.xml

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<packaging>jar</packaging>
@ -47,12 +47,16 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>


@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

dhp-build/pom.xml

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>

dhp-common/pom.xml

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Constants.java

@ -10,6 +10,12 @@ public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static final String ROR_NS_PREFIX = "ror_________";
public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222";
public static final String ROR_DATASOURCE_NAME = "Research Organization Registry (ROR)";
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
private Constants() {

MDStoreInfo.java

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.common;
/**
* This utility represents the Metadata Store information
* needed during the migration from MongoDB to HDFS.
*/
public class MDStoreInfo {
private String mdstore;
private String currentId;
private Long latestTimestamp;
/**
* Instantiates a new Md store info.
*/
public MDStoreInfo() {
}
/**
* Instantiates a new Md store info.
*
* @param mdstore the mdstore
* @param currentId the current id
* @param latestTimestamp the latest timestamp
*/
public MDStoreInfo(String mdstore, String currentId, Long latestTimestamp) {
this.mdstore = mdstore;
this.currentId = currentId;
this.latestTimestamp = latestTimestamp;
}
/**
* Gets mdstore.
*
* @return the mdstore
*/
public String getMdstore() {
return mdstore;
}
/**
* Sets mdstore.
*
* @param mdstore the mdstore
* @return the mdstore
*/
public MDStoreInfo setMdstore(String mdstore) {
this.mdstore = mdstore;
return this;
}
/**
* Gets current id.
*
* @return the current id
*/
public String getCurrentId() {
return currentId;
}
/**
* Sets current id.
*
* @param currentId the current id
* @return the current id
*/
public MDStoreInfo setCurrentId(String currentId) {
this.currentId = currentId;
return this;
}
/**
* Gets latest timestamp.
*
* @return the latest timestamp
*/
public Long getLatestTimestamp() {
return latestTimestamp;
}
/**
* Sets latest timestamp.
*
* @param latestTimestamp the latest timestamp
* @return the latest timestamp
*/
public MDStoreInfo setLatestTimestamp(Long latestTimestamp) {
this.latestTimestamp = latestTimestamp;
return this;
}
@Override
public String toString() {
return "MDStoreInfo{" +
"mdstore='" + mdstore + '\'' +
", currentId='" + currentId + '\'' +
", latestTimestamp=" + latestTimestamp +
'}';
}
}
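
A minimal usage sketch of the fluent setters above; the identifiers and the timestamp are invented for illustration:

    MDStoreInfo info = new MDStoreInfo()
        .setMdstore("md-1234")                // hypothetical mdstore id
        .setCurrentId("md-1234::v2")          // hypothetical current collection id
        .setLatestTimestamp(1684849443000L);  // hypothetical epoch millis
    System.out.println(info);
    // MDStoreInfo{mdstore='md-1234', currentId='md-1234::v2', latestTimestamp=1684849443000}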

MakeTarArchive.java

@ -5,13 +5,71 @@ import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MakeTarArchive implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeTarArchive.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MakeTarArchive.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/input_maketar_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
final int gBperSplit = Optional
.ofNullable(parser.get("splitSize"))
.map(Integer::valueOf)
.orElse(10);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit);
}
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit)
throws IOException {
RemoteIterator<LocatedFileStatus> dirIterator = fileSystem.listLocatedStatus(new Path(inputPath));
while (dirIterator.hasNext()) {
LocatedFileStatus fileStatus = dirIterator.next();
Path p = fileStatus.getPath();
String pathString = p.toString();
String entity = pathString.substring(pathString.lastIndexOf("/") + 1);
MakeTarArchive.tarMaxSize(fileSystem, pathString, outputPath + "/" + entity, entity, gBperSplit);
}
}
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath);
if (fileSystem.exists(hdfsWritePath)) {
@ -21,7 +79,7 @@ public class MakeTarArchive implements Serializable {
return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream());
}
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dirName)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
@ -37,7 +95,7 @@ public class MakeTarArchive implements Serializable {
new Path(inputPath), true);
while (iterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, iterator, ar, 0);
writeCurrentFile(fileSystem, dirName, iterator, ar, 0);
}
}
@ -59,32 +117,30 @@ public class MakeTarArchive implements Serializable {
new Path(inputPath), true);
boolean next = fileStatusListIterator.hasNext();
while (next) {
TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar");
try (TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar")) {
long current_size = 0;
while (next && current_size < bytesPerSplit) {
current_size = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, current_size);
next = fileStatusListIterator.hasNext();
long currentSize = 0;
while (next && currentSize < bytesPerSplit) {
currentSize = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, currentSize);
next = fileStatusListIterator.hasNext();
}
partNum += 1;
}
partNum += 1;
ar.close();
}
}
}
private static long writeCurrentFile(FileSystem fileSystem, String dir_name,
private static long writeCurrentFile(FileSystem fileSystem, String dirName,
RemoteIterator<LocatedFileStatus> fileStatusListIterator,
TarArchiveOutputStream ar, long current_size) throws IOException {
TarArchiveOutputStream ar, long currentSize) throws IOException {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
String pString = p.toString();
if (!pString.endsWith("_SUCCESS")) {
String name = pString.substring(pString.lastIndexOf("/") + 1);
if (name.startsWith("part-") && name.length() > 10) {
String tmp = name.substring(0, 10);
if (name.contains(".")) {
@ -92,9 +148,9 @@ public class MakeTarArchive implements Serializable {
}
name = tmp;
}
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
TarArchiveEntry entry = new TarArchiveEntry(dirName + "/" + name);
entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen();
currentSize += fileStatus.getLen();
ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath());
@ -110,7 +166,7 @@ public class MakeTarArchive implements Serializable {
ar.closeArchiveEntry();
}
return current_size;
return currentSize;
}
}
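
A hedged sketch of invoking the archiver directly, mirroring what main() does after argument parsing; the namenode URL and the paths are assumptions:

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // assumed namenode
    FileSystem fileSystem = FileSystem.get(conf);
    // each directory under the input path becomes <outputPath>/<entity>_<n>.tar,
    // split so that no single tar holds more than ~10 GB of entries
    MakeTarArchive.makeTArArchive(fileSystem, "/user/dnet/graph", "/user/dnet/graph_tar", 10);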

MdstoreClient.java

@ -1,12 +1,12 @@
package eu.dnetlib.dhp.common;
import static com.mongodb.client.model.Sorts.descending;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
@ -38,6 +38,26 @@ public class MdstoreClient implements Closeable {
this.db = getDb(client, dbName);
}
private Long parseTimestamp(Document f) {
if (f == null || !f.containsKey("timestamp"))
return null;
Object ts = f.get("timestamp");
return Long.parseLong(ts.toString());
}
public Long getLatestTimestamp(final String collectionId) {
MongoCollection<Document> collection = db.getCollection(collectionId);
FindIterable<Document> result = collection.find().sort(descending("timestamp")).limit(1);
if (result == null) {
return null;
}
Document f = result.first();
return parseTimestamp(f);
}
public MongoCollection<Document> mdStore(final String mdId) {
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
@ -54,6 +74,16 @@ public class MdstoreClient implements Closeable {
return getColl(db, currentId, true);
}
public List<MDStoreInfo> mdStoreWithTimestamp(final String mdFormat, final String mdLayout,
final String mdInterpretation) {
Map<String, String> res = validCollections(mdFormat, mdLayout, mdInterpretation);
return res
.entrySet()
.stream()
.map(e -> new MDStoreInfo(e.getKey(), e.getValue(), getLatestTimestamp(e.getValue())))
.collect(Collectors.toList());
}
public Map<String, String> validCollections(
final String mdFormat, final String mdLayout, final String mdInterpretation) {
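
An illustrative call of the new mdStoreWithTimestamp helper, assuming the (baseUrl, dbName) constructor; the Mongo URL, database name and the format/layout/interpretation triple are placeholders:

    try (MdstoreClient client = new MdstoreClient("mongodb://localhost:27017", "mdstore")) {
        for (MDStoreInfo info : client.mdStoreWithTimestamp("ODF", "store", "cleaned")) {
            System.out.println(info); // mdstore id, current collection id, latest record timestamp
        }
    }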

PacePerson.java

@ -1,18 +1,18 @@
package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;
import com.ctc.wstx.dtd.LargePrefixedNameSet;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
@ -29,7 +29,19 @@ public class PacePerson {
private List<String> fullname = Lists.newArrayList();
private final String original;
private static Set<String> particles = null;
private static Set<String> particles;
static {
try {
particles = new HashSet<>(IOUtils
.readLines(
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Capitalizes a string
@ -37,29 +49,20 @@ public class PacePerson {
* @param s the string to capitalize
* @return the input string with the first letter of each word capitalized
*/
public static final String capitalize(final String s) {
public static String capitalize(final String s) {
if (particles.contains(s)) {
return s;
}
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
/**
* Adds a dot to a string with length equals to 1
*/
public static final String dotAbbreviations(final String s) {
public static String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
/**
* The constructor of the class. It fills the fields of the class based on the input fullname.
*
@ -128,10 +131,6 @@ public class PacePerson {
}
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
@ -187,17 +186,36 @@ public class PacePerson {
}
public List<String> getCapitalFirstnames() {
return Lists
.newArrayList(
Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
return Optional
.ofNullable(getNameWithAbbreviations())
.map(
name -> name
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
return Optional
.ofNullable(getSurname())
.map(
surname -> surname
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
return Optional
.ofNullable(getName())
.map(
name -> name
.stream()
.map(PacePerson::dotAbbreviations)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public boolean isAccurate() {
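
A short sketch of the accessors reworked above, assuming the two-argument constructor (fullname plus an aggressive-normalization flag); the name is invented:

    PacePerson p = new PacePerson("baglioni, miriam", false); // assumed constructor signature
    System.out.println(p.getNameWithAbbreviations()); // e.g. [miriam]
    System.out.println(p.getCapitalFirstnames());     // e.g. [Miriam]
    System.out.println(p.getCapitalSurname());        // e.g. [Baglioni]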

ReadDatasourceMasterDuplicateFromDB.java

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.common.action;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class ReadDatasourceMasterDuplicateFromDB {
private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId "
+
"FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
throws IOException {
int count = 0;
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));
log.info("running query: {}", QUERY);
log.info("storing results in: {}", hdfsPath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
dbClient.processResults(QUERY, rs -> writeMap(datasourceMasterMap(rs), writer));
count++;
}
}
return count;
}
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
try {
final MasterDuplicate md = new MasterDuplicate();
final String duplicateId = rs.getString("duplicateId");
final String masterId = rs.getString("masterId");
final String masterName = rs.getString("masterName");
md.setDuplicateId(OafMapperUtils.createOpenaireId(10, duplicateId, true));
md.setMasterId(OafMapperUtils.createOpenaireId(10, masterId, true));
md.setMasterName(masterName);
return md;
} catch (final SQLException e) {
throw new RuntimeException(e);
}
}
private static void writeMap(final MasterDuplicate dm, final BufferedWriter writer) {
try {
writer.write(OBJECT_MAPPER.writeValueAsString(dm));
writer.newLine();
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}
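
A hypothetical invocation; every connection value below is invented:

    int written = ReadDatasourceMasterDuplicateFromDB
        .execute(
            "jdbc:postgresql://postgres:5432/dnet", // assumed service DB URL
            "dnet", "secret",                       // assumed credentials
            "/user/dnet/masterduplicate",           // HDFS output path
            "hdfs://namenode:8020");                // assumed namenode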

MasterDuplicate.java

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.common.action.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 21/07/22
*/
public class MasterDuplicate implements Serializable {
private String duplicateId;
private String masterId;
private String masterName;
public String getDuplicateId() {
return duplicateId;
}
public void setDuplicateId(String duplicateId) {
this.duplicateId = duplicateId;
}
public String getMasterId() {
return masterId;
}
public void setMasterId(String masterId) {
this.masterId = masterId;
}
public String getMasterName() {
return masterName;
}
public void setMasterName(String masterName) {
this.masterName = masterName;
}
}

ZenodoAPIClient.java

@ -3,10 +3,13 @@ package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.jetbrains.annotations.NotNull;
import com.google.gson.Gson;
@ -60,33 +63,31 @@ public class ZenodoAPIClient implements Serializable {
*/
public int newDeposition() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
// Get response body
json = response.body().string();
ZenodoModel newSubmission = new Gson().fromJson(json, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return response.code();
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return responseCode;
}
/**
@ -94,28 +95,48 @@ public class ZenodoAPIClient implements Serializable {
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @param len the size of the file
* @return the response code
*/
public int uploadIS(InputStream is, String file_name, long len) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.writeTimeout(600, TimeUnit.SECONDS)
.readTimeout(600, TimeUnit.SECONDS)
.connectTimeout(600, TimeUnit.SECONDS)
.build();
public int uploadIS(InputStream is, String file_name) throws IOException {
Request request = new Request.Builder()
.url(bucket + "/" + file_name)
.addHeader(HttpHeaders.CONTENT_TYPE, "application/zip") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
.build();
URL url = new URL(bucket + "/" + file_name);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
byte[] buf = new byte[8192];
int length;
try (OutputStream os = conn.getOutputStream()) {
while ((length = is.read(buf)) != -1) {
os.write(buf, 0, length);
}
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
int responseCode = conn.getResponseCode();
if (!checkOKStatus(responseCode)) {
throw new IOException("Unexpected code " + responseCode + getBody(conn));
}
return responseCode;
}
@NotNull
private String getBody(HttpURLConnection conn) throws IOException {
String body = "{}";
try (BufferedReader br = new BufferedReader(
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
StringBuilder response = new StringBuilder();
String responseLine = null;
while ((responseLine = br.readLine()) != null) {
response.append(responseLine.trim());
}
body = response.toString();
}
return body;
}
/**
@ -127,26 +148,34 @@ public class ZenodoAPIClient implements Serializable {
*/
public int sendMretadata(String metadata) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
RequestBody body = RequestBody.create(metadata, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
try (OutputStream os = conn.getOutputStream()) {
byte[] input = metadata.getBytes("utf-8");
os.write(input, 0, input.length);
}
final int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + getBody(conn));
return responseCode;
}
private boolean checkOKStatus(int responseCode) {
// successful only for 200 OK or 201 Created
return HttpURLConnection.HTTP_OK == responseCode
|| HttpURLConnection.HTTP_CREATED == responseCode;
}
/**
@ -155,6 +184,7 @@ public class ZenodoAPIClient implements Serializable {
* @return response code
* @throws IOException
*/
@Deprecated
public int publish() throws IOException {
String json = "{}";
@ -191,31 +221,37 @@ public class ZenodoAPIClient implements Serializable {
* @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id);
setDepositionId(concept_rec_id, 1);
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("POST");
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return response.code();
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return responseCode;
}
/**
@ -233,29 +269,38 @@ public class ZenodoAPIClient implements Serializable {
this.deposition_id = deposition_id;
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
String json = "{}";
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Authorization", "Bearer " + access_token)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return response.code();
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return responseCode;
}
private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException {
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class);
ZenodoModelList zenodoModelList = new Gson()
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
@ -263,55 +308,57 @@ public class ZenodoAPIClient implements Serializable {
return;
}
}
throw new MissingConceptDoiException("The concept record id specified was missing in the list of depositions");
if (zenodoModelList.size() == 0)
throw new MissingConceptDoiException(
"The concept record id specified was missing in the list of depositions");
setDepositionId(concept_rec_id, page + 1);
}
private String getPrevDepositions() throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
private String getPrevDepositions(String page) throws IOException {
Request request = new Request.Builder()
.url(urlString)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
try (Response response = httpClient.newCall(request).execute()) {
URL url = new URL(urlBuilder.build().toString());
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
String body = getBody(conn);
return response.body().string();
int responseCode = conn.getResponseCode();
}
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return body;
}
private String getBucket(String url) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.connectTimeout(600, TimeUnit.SECONDS)
.build();
private String getBucket(String inputUrl) throws IOException {
Request request = new Request.Builder()
.url(url)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
URL url = new URL(inputUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
try (Response response = httpClient.newCall(request).execute()) {
String body = getBody(conn);
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
int responseCode = conn.getResponseCode();
// Get response body
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return zenodoModel.getLinks().getBucket();
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
}
return zenodoModel.getLinks().getBucket();
}
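
A sketch of the typical call sequence after this move from OkHttp to HttpURLConnection; the constructor signature, the token, the concept record id, the metadataJson String and the java.io imports are assumptions:

    ZenodoAPIClient client = new ZenodoAPIClient(urlString, accessToken); // assumed constructor
    client.newVersion("1234567"); // pages through prior depositions; throws MissingConceptDoiException if absent
    try (InputStream is = new FileInputStream("dump.tar")) {
        client.uploadIS(is, "dump.tar"); // streams the file to <bucket>/dump.tar via PUT
    }
    client.sendMretadata(metadataJson); // PUTs the JSON metadata onto the deposition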

DecompressTarGz.java

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.common.collection;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class DecompressTarGz {
public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException {
FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath));
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(inputFileStream))) {
TarArchiveEntry entry = null;
while ((entry = tais.getNextTarEntry()) != null) {
if (!entry.isDirectory()) {
try (
FSDataOutputStream out = fs
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(tais, gzipOs);
}
}
}
}
}
}
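
An assumed usage of doExtract; since outputPath is concatenated directly with each entry name, it should end with a '/'. Paths and namenode are placeholders:

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // assumed namenode
    FileSystem fs = FileSystem.get(conf);
    // each archive entry is re-compressed to <outputPath><entryName>.gz
    DecompressTarGz.doExtract(fs, "/user/dnet/extracted/", "/user/dnet/dump.tar.gz");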

Vocabulary.java

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
@ -66,21 +67,39 @@ public class Vocabulary implements Serializable {
}
public Qualifier getTermAsQualifier(final String termId) {
if (StringUtils.isBlank(termId)) {
return getTermAsQualifier(termId, false);
}
public Qualifier getTermAsQualifier(final String termId, boolean strict) {
final VocabularyTerm term = getTerm(termId);
if (Objects.nonNull(term)) {
return OafMapperUtils.qualifier(term.getId(), term.getName(), getId(), getName());
} else if (Objects.isNull(term) && strict) {
return OafMapperUtils.unknown(getId(), getName());
} else if (termExists(termId)) {
final VocabularyTerm t = getTerm(termId);
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
} else {
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
}
}
public Qualifier getSynonymAsQualifier(final String syn) {
return getSynonymAsQualifier(syn, false);
}
public Qualifier getSynonymAsQualifier(final String syn, boolean strict) {
return Optional
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId()))
.map(term -> getTermAsQualifier(term.getId(), strict))
.orElse(null);
}
public Qualifier lookup(String id) {
return lookup(id, false);
}
public Qualifier lookup(String id, boolean strict) {
return Optional
.ofNullable(getSynonymAsQualifier(id, strict))
.orElse(getTermAsQualifier(id, strict));
}
}
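
A minimal sketch of the strict/non-strict semantics introduced here, assuming a populated Vocabulary instance named accessModes:

    Qualifier q1 = accessModes.lookup("OPEN");              // term or synonym -> its qualifier
    Qualifier q2 = accessModes.lookup("not-a-term");        // non-strict -> echoes the input term back
    Qualifier q3 = accessModes.lookup("not-a-term", true);  // strict -> the vocabulary's UNKNOWN qualifier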

VocabularyGroup.java

@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
public Optional<Vocabulary> find(final String vocId) {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::get);
}
public void addTerm(final String vocId, final String id, final String name) {
if (vocabularyExists(vocId)) {
vocs.get(vocId.toLowerCase()).addTerm(id, name);
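
An illustrative chain over the new Optional-returning find, assuming a loaded VocabularyGroup named vocabularies and the dnet:access_modes vocabulary id:

    vocabularies
        .find("dnet:access_modes") // case-insensitive lookup, empty Optional if absent
        .map(voc -> voc.lookup("OPEN"))
        .ifPresent(q -> System.out.println(q.getClassname()));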

DispatchEntitiesSparkJob.java

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.dedup;
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -38,7 +38,7 @@ public class DispatchEntitiesSparkJob {
.requireNonNull(
DispatchEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/dispatch_entities_parameters.json")));
"/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

GroupEntitiesSparkJob.java

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.dedup;
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
@ -53,7 +53,7 @@ public class GroupEntitiesSparkJob {
.toString(
GroupEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/group_graph_entities_parameters.json"));
"/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

GraphCleaningFunctions.java

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
@ -11,12 +13,15 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -35,6 +40,127 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
if (ModelSupport.isSubClass(value, Result.class)) {
final Result res = (Result) value;
if (shouldCleanContext(res, verifyParam)) {
res
.setContext(
res
.getContext()
.stream()
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
.collect(Collectors.toList()));
}
return (T) res;
} else {
return value;
}
}
private static boolean shouldCleanContext(Result res, String verifyParam) {
boolean titleMatch = res
.getTitle()
.stream()
.filter(
t -> t
.getQualifier()
.getClassid()
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()));
return titleMatch && Objects.nonNull(res.getContext());
}
public static <T extends Oaf> T cleanCountry(T value, String[] verifyParam, Set<String> hostedBy,
String collectedfrom, String country) {
if (ModelSupport.isSubClass(value, Result.class)) {
final Result res = (Result) value;
if (res.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
!res.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
return (T) res;
}
List<StructuredProperty> ids = getPidsAndAltIds(res).collect(Collectors.toList());
if (ids
.stream()
.anyMatch(
p -> p
.getQualifier()
.getClassid()
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
res
.setCountry(
res
.getCountry()
.stream()
.filter(
c -> toTakeCountry(c, country))
.collect(Collectors.toList()));
}
return (T) res;
} else {
return value;
}
}
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
final Stream<StructuredProperty> resultPids = Optional
.ofNullable(r.getPid())
.map(Collection::stream)
.orElse(Stream.empty());
final Stream<StructuredProperty> instancePids = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getPid())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
final Stream<StructuredProperty> instanceAltIds = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getAlternateIdentifier())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
return Stream
.concat(
Stream.concat(resultPids, instancePids),
instanceAltIds);
}
private static boolean pidInParam(String value, String[] verifyParam) {
for (String s : verifyParam)
if (value.startsWith(s))
return true;
return false;
}
private static boolean toTakeCountry(Country c, String country) {
// if dataInfo or dataInfo.inferenceprovenance is missing, the country cannot have been
// inserted via propagation, so it is kept
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
return true;
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
return true;
return !(c
.getClassid()
.equalsIgnoreCase(country) &&
c.getDataInfo().getInferenceprovenance().equals("propagation"));
}
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
// nothing to clean here
@ -101,7 +227,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.orElse(true))
.orElse(true))
.orElse(true))) {
return false;
return true;
}
if (value instanceof Datasource) {
@ -191,8 +317,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
List<Subject> subjects = Lists
.newArrayList(
r
.getSubject()
.stream()
@ -200,8 +326,26 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(s -> {
if ("dnet:result_subject".equals(s.getQualifier().getClassid())) {
s.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
s.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
}
return s;
})
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
.collect(
Collectors
.toMap(
s -> Optional
.ofNullable(s.getQualifier())
.map(q -> q.getClassid() + s.getValue())
.orElse(s.getValue()),
Function.identity(),
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
.values());
r.setSubject(subjects);
}
if (Objects.nonNull(r.getTitle())) {
r
@ -321,7 +465,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
if (Objects.isNull(i.getRefereed()) || StringUtils.isBlank(i.getRefereed().getClassid())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
if (Objects.nonNull(i.getDateofacceptance())) {
@ -382,14 +526,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = Optional
.ofNullable(p.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
String pidProvenance = getProvenance(p.getDataInfo());
if (p
.getQualifier()
.getClassid()
@ -520,6 +657,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return s;
}
protected static Subject cleanValue(Subject s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
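
A hedged sketch of the two new cleaning entry points; pub, hostedByIds and the literal values are placeholders:

    // drop contexts whose id starts with "sobigdata" from results whose main title
    // starts with the verification string
    Publication cleanedCtx = GraphCleaningFunctions.cleanContext(pub, "sobigdata", "test");
    // drop a propagated country (e.g. NL) from results whose DOI starts with one of the
    // given prefixes, unless hosted by one of the listed datasources or not collected
    // from the given one
    Publication cleanedCountry = GraphCleaningFunctions
        .cleanCountry(pub, new String[] { "10.17632" }, hostedByIds, "Datacite", "NL");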

OafMapperUtils.java

@ -3,6 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.sql.Array;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
@ -12,6 +14,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -47,6 +50,17 @@ public class OafMapperUtils {
}
public static Result mergeResults(Result left, Result right) {
final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left);
final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right);
if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) {
return left;
}
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
return right;
}
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
@ -56,6 +70,18 @@ public class OafMapperUtils {
}
}
private static boolean isFromDelegatedAuthority(Result r) {
return Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.filter(i -> Objects.nonNull(i.getCollectedfrom()))
.map(i -> i.getCollectedfrom().getKey())
.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
.orElse(false);
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
@ -95,6 +121,17 @@ public class OafMapperUtils {
.collect(Collectors.toList());
}
public static <T> List<T> listValues(Array values) throws SQLException {
if (Objects.isNull(values)) {
return null;
}
return Arrays
.stream((T[]) values.getArray())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()
@ -105,7 +142,7 @@ public class OafMapperUtils {
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
@ -153,6 +190,17 @@ public class OafMapperUtils {
return q;
}
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
@ -164,6 +212,20 @@ public class OafMapperUtils {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
@ -368,4 +430,88 @@ public class OafMapperUtils {
}
return null;
}
public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) {
KeyValue kv = new KeyValue();
kv.setDataInfo(dataInfo);
kv.setKey(key);
kv.setValue(value);
return kv;
}
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
Measure m = new Measure();
m.setId(id);
m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
return m;
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity,
final String validationDate) {
return getRelation(
source, target, relType, subRelType, relClass, entity.getCollectedfrom(), entity.getDataInfo(),
entity.getLastupdatetimestamp(), validationDate, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp) {
return getRelation(
source, target, relType, subRelType, relClass, collectedfrom, dataInfo, lastupdatetimestamp, null, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp,
final String validationDate,
final List<KeyValue> properties) {
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
rel.setCollectedfrom(collectedfrom);
rel.setDataInfo(dataInfo);
rel.setLastupdatetimestamp(lastupdatetimestamp);
rel.setValidated(StringUtils.isNotBlank(validationDate));
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
rel.setProperties(properties);
return rel;
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
}
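
A hypothetical use of the new relation factory, deriving provenance from an existing entity and marking the link as validated; the ids are illustrative and the relation constants are assumed to come from ModelConstants:

    Relation rel = OafMapperUtils
        .getRelation(
            "50|doi_________::ab12...",  // illustrative source id
            "40|corda_______::cd34...",  // illustrative target id
            ModelConstants.RESULT_PROJECT, ModelConstants.OUTCOME, ModelConstants.IS_PRODUCED_BY,
            entity,        // an OafEntity supplying collectedfrom, dataInfo and lastupdatetimestamp
            "2023-05-23"); // non-blank validation date -> rel.getValidated() == true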

SubjectProvenanceComparator.java

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import static org.apache.commons.lang3.StringUtils.isBlank;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Subject;
public class SubjectProvenanceComparator implements Comparator<Subject> {
@Override
public int compare(Subject left, Subject right) {
String lProv = getProvenance(left.getDataInfo());
String rProv = getProvenance(right.getDataInfo());
if (isBlank(lProv) && isBlank(rProv))
return 0;
if (isBlank(lProv))
return 1;
if (isBlank(rProv))
return -1;
if (lProv.equals(rProv))
return 0;
if (lProv.toLowerCase().contains("crosswalk"))
return -1;
if (rProv.toLowerCase().contains("crosswalk"))
return 1;
if (lProv.toLowerCase().contains("user"))
return -1;
if (rProv.toLowerCase().contains("user"))
return 1;
if (lProv.toLowerCase().contains("propagation"))
return -1;
if (rProv.toLowerCase().contains("propagation"))
return 1;
if (lProv.toLowerCase().contains("iis"))
return -1;
if (rProv.toLowerCase().contains("iis"))
return 1;
return 0;
}
}
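
A minimal sketch of how the cleaning code uses this comparator: among duplicate subjects, Collections.min keeps the one whose provenance ranks best (crosswalk before user before propagation before iis); the two Subject instances are assumed to exist:

    Subject best = Collections.min(
        Lists.newArrayList(subjectFromCrosswalk, subjectFromIis),
        new SubjectProvenanceComparator()); // -> subjectFromCrosswalk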

DHPUtils.java

@ -75,9 +75,14 @@ public class DHPUtils {
final HttpGet req = new HttpGet(url);
log.info("MDStoreManager request: {}", req);
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String json = IOUtils.toString(response.getEntity().getContent());
log.info("MDStoreManager response: {}", json);
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
return Arrays
.stream(mdstores)

SparkScalaApplication.scala

@ -2,71 +2,72 @@ package eu.dnetlib.dhp.application
import scala.io.Source
/**
* This is the main interface SparkApplication
* from which all Spark Scala classes should inherit
*
*/
/** This is the main interface SparkApplication
* from which all Spark Scala classes should inherit
*/
trait SparkScalaApplication {
/**
* This is the path in the classpath of the json
* that describes all the arguments needed to run
*/
/** This is the path in the classpath of the json
* that describes all the arguments needed to run
*/
val propertyPath: String
/**
* Utility to parse the arguments using the
* property json in the classpath identified by
* the variable propertyPath
*
* @param args the list of arguments
*/
/** Utility to parse the arguments using the
* property json in the classpath identified by
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
val parser = new ArgumentApplicationParser(
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
)
parser.parseArgument(args)
parser
}
/**
* Here every Spark application runs this method
* where the whole logic of the Spark node is defined
*/
/** Here every Spark application runs this method
* where the whole logic of the Spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
abstract class AbstractScalaApplication(
val propertyPath: String,
val args: Array[String],
log: Logger
) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark:SparkSession = null
var spark: SparkSession = null
def initialize():SparkScalaApplication = {
def initialize(): SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/**
* Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession():SparkSession = {
require(parser!= null)
/** Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession(): SparkSession = {
require(parser != null)
val conf:SparkConf = new SparkConf()
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession.builder().config(conf)
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}
}

ScholixUtils.scala

@ -14,7 +14,6 @@ import scala.io.Source
object ScholixUtils extends Serializable {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY: String = "RelationDate"
@ -24,7 +23,11 @@ object ScholixUtils extends Serializable {
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")).mkString
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -32,13 +35,14 @@ object ScholixUtils extends Serializable {
json.extract[Map[String, RelationVocabulary]]
}
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || relation.getProperties.isEmpty)
null
else {
val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
val date = relation.getProperties.asScala
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
.map(p => p.getValue)
if (date.isDefined)
date.get
else
@ -58,78 +62,80 @@ object ScholixUtils extends Serializable {
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
}
def generateScholixResourceFromResult(r:Result) :ScholixResource = {
def generateScholixResourceFromResult(r: Result): ScholixResource = {
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
}
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
else if (b1 != null)
b1
else
b2
}
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(
a._1,
b.relatedDataset + relatedDataset,
b.relatedPublication + relatedPublication
)
}
else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(
b1.id,
b1.relatedDataset + b2.relatedDataset,
b1.relatedPublication + b2.relatedPublication
)
else if (b1 != null)
b1
else
b2
}
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
if (scholix_complete(b)) b else a._2
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
}
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
if (scholix_complete(b)) b else a._2
}
override def merge(b1: Scholix, b2: Scholix): Scholix = {
if (scholix_complete(b1)) b1 else b2
}
override def finish(reduction: Scholix): Scholix = reduction
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
override def merge(b1: Scholix, b2: Scholix): Scholix = {
if (scholix_complete(b1)) b1 else b2
}
override def finish(reduction: Scholix): Scholix = reduction
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
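
The aggregators above follow Spark's typed Aggregator contract. A minimal sketch of how such an aggregator is typically applied to a keyed Dataset (the Scholix import path and the input Dataset are assumptions):

import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import org.apache.spark.sql.{Dataset, Encoders}

// Keep one "complete" Scholix per target id; `pairs` is a placeholder for a
// Dataset[(String, Scholix)] produced earlier in a Scholexplorer job.
def mergeById(pairs: Dataset[(String, Scholix)]): Dataset[(String, Scholix)] =
  pairs
    .groupByKey(_._1)(Encoders.STRING)
    .agg(ScholixUtils.scholixAggregator.toColumn)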
def createInverseScholixRelation(scholix: Scholix): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -138,16 +144,19 @@ object ScholixUtils extends Serializable {
s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource)
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map {
d => new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
}(collection.breakOut)
l
} else List()
@ -155,8 +164,11 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map {
d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
new ScholixEntityId(
d.getDatasourceName,
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
)
}(collection.breakOut)
l
} else List()
@ -165,17 +177,16 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map {
c =>
new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava)
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
new ScholixEntityId(
c.getValue,
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
)
}.toList
l
} else List()
}
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -184,11 +195,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target))
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -197,11 +211,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(target)
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
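
Both generateCompleteScholix overloads derive the identifier from the same convention, md5 of "<sourceId>::<relationName>::<targetId>". A small illustration, assuming DHPUtils.md5 is the hex-digest helper used above (the ids are made up):

import eu.dnetlib.dhp.utils.DHPUtils

// Identifier = md5("<sourceId>::<relationName>::<targetId>")
val sourceId = "50|doi_________::aaaa" // illustrative
val targetId = "60|r3d100010468::bbbb" // illustrative
val id = DHPUtils.md5(s"$sourceId::IsRelatedTo::$targetId")
assert(id.length == 32) // 32-character hex digest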
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
val r = new ScholixResource
r.setIdentifier(summaryObject.getLocalIdentifier)
@ -214,7 +231,8 @@ object ScholixUtils extends Serializable {
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
@ -222,20 +240,27 @@ object ScholixUtils extends Serializable {
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
val plist: List[ScholixEntityId] =
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
, "collected", "complete"
)).toList
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
.map(c =>
new ScholixCollectedFrom(
new ScholixEntityId(
c.getDatasourceName,
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
),
"collected",
"complete"
)
)
.toList
if (l.nonEmpty)
r.setCollectedFrom(l.asJava)
@ -244,9 +269,7 @@ object ScholixUtils extends Serializable {
r
}
def scholixFromSource(relation: Relation, source: ScholixResource):Scholix = {
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
@ -262,7 +285,6 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
s.setPublisher(source.getPublisher)
}
@ -270,13 +292,14 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(source)
s
}
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
if (relation == null || source == null)
@ -298,12 +321,10 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
.map {
p =>
new ScholixEntityId(p, null)
.map { p =>
new ScholixEntityId(p, null)
}(collection.breakOut)
if (l.nonEmpty)
@ -313,31 +334,37 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(generateScholixResourceFromSummary(source))
s
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = {
pidValue.map {
p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
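
A self-contained sketch of the matching rule inside findURLForPID, with plain strings standing in for StructuredProperty (a hypothetical simplification):

// A PID is paired with the first URL whose lowercase form contains the PID value.
def matchUrl(pidValue: String, urls: List[String]): String =
  urls.find(u => u.toLowerCase.contains(pidValue.toLowerCase)).orNull

val urls = List("https://doi.org/10.1371/journal.pone.0085605")
assert(matchUrl("10.1371/journal.pone.0085605", urls) != null) // PID value appears in the URL
assert(matchUrl("10.9999/nomatch", urls) == null)              // unmatched PIDs pair with null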
def resultToSummary(r: Result): ScholixSummary = {
@ -371,7 +398,12 @@ object ScholixUtils extends Serializable {
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue).toList
val dt: List[String] = r
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
@ -382,7 +414,9 @@ object ScholixUtils extends Serializable {
}
if (r.getSubject != null && !r.getSubject.isEmpty) {
val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).toList
val subjects: List[SchemeValue] = r.getSubject.asScala
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
.toList
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
@ -391,7 +425,9 @@ object ScholixUtils extends Serializable {
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete")).toList
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
.toList
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class MdStoreClientTest {
@Test
public void testMongoCollection() throws IOException {
final MdstoreClient client = new MdstoreClient("mongodb://localhost:27017", "mdstore");
final ObjectMapper mapper = new ObjectMapper();
final List<MDStoreInfo> infos = client.mdStoreWithTimestamp("ODF", "store", "cleaned");
infos.forEach(System.out::println);
final String s = mapper.writeValueAsString(infos);
Path fileName = Paths.get("/Users/sandro/mdstore_info.json");
// Writing into the file
Files.write(fileName, s.getBytes(StandardCharsets.UTF_8));
}
}
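
Note that the test above needs a MongoDB on localhost and writes to a developer-specific absolute path. A more portable variant would target a temporary file; a minimal sketch in which only the output handling changes:

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Path}

// Write the serialized MDStoreInfo list to a JVM-managed temp file instead of
// a hardcoded home directory, so the test can run on any machine.
val s = "[]" // stands for mapper.writeValueAsString(infos)
val fileName: Path = Files.createTempFile("mdstore_info", ".json")
Files.write(fileName, s.getBytes(StandardCharsets.UTF_8))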

View File

@ -33,7 +33,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
@ -56,7 +56,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
@ -80,7 +80,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
@ -100,7 +100,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());

View File

@ -1,100 +0,0 @@
package eu.dnetlib.dhp.oa.merge;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
class AuthorMergerTest {
private String publicationsBasePath;
private List<List<Author>> authors;
@BeforeEach
public void setUp() throws Exception {
publicationsBasePath = Paths
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
.toFile()
.getAbsolutePath();
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
.stream()
.map(p -> p._2().getAuthor())
.collect(Collectors.toList());
}
@Test
void mergeTest() { // used in the dedup: threshold set to 0.95
for (List<Author> authors1 : authors) {
System.out.println("List " + (authors.indexOf(authors1) + 1));
for (Author author : authors1) {
System.out.println(authorToString(author));
}
}
List<Author> merge = AuthorMerger.merge(authors);
System.out.println("Merge ");
for (Author author : merge) {
System.out.println(authorToString(author));
}
Assertions.assertEquals(7, merge.size());
}
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
List<Tuple2<String, T>> res = new ArrayList<>();
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(path));
String line = reader.readLine();
while (line != null) {
res
.add(
new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz)));
// read next line
line = reader.readLine();
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return res;
}
public String authorToString(Author a) {
String print = "Fullname = ";
print += a.getFullname() + " pid = [";
if (a.getPid() != null)
for (StructuredProperty sp : a.getPid()) {
print += sp.toComparableString() + " ";
}
print += "]";
return print;
}
}

View File

@ -44,105 +44,104 @@ class OafMapperUtilsTest {
@Test
void testDateValidation() {
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
assertNotNull(GraphCleaningFunctions.cleanDate("2016-05-07T12:41:19.202Z "));
assertNotNull(GraphCleaningFunctions.cleanDate("2020-09-10 11:08:52 "));
assertNotNull(GraphCleaningFunctions.cleanDate(" 2016-04-05"));
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
assertEquals("2016-04-05", GraphCleaningFunctions.cleanDate("2016 Apr 05"));
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
assertEquals("2009-05-08", GraphCleaningFunctions.cleanDate("May 8, 2009 5:57:51 PM"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, '70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 70"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 MST 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 02 15:04:05 -0700 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Monday, 02-Jan-06 15:04:05 MST"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 MST"));
assertEquals("2017-07-11", GraphCleaningFunctions.cleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 -0700"));
assertEquals("2018-01-04", GraphCleaningFunctions.cleanDate("Thu, 4 Jan 2018 17:53:36 +0000"));
assertEquals("2015-08-10", GraphCleaningFunctions.cleanDate("Mon Aug 10 15:44:11 UTC+0100 2015"));
assertEquals(
"2015-07-03",
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
GraphCleaningFunctions.cleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 10:09am"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 at 10:09am PST-08"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012, 10:10:09"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7th, 1970"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006, 19:17"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006 19:17"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 1970"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("03 February 2013"));
assertEquals("2013-07-01", GraphCleaningFunctions.cleanDate("1 July 2013"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("2013-Feb-03"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3/31/2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03/31/2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08/21/71"));
assertEquals("1971-01-08", GraphCleaningFunctions.cleanDate("8/1/71"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("04/08/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/14 22:05"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("04/2/2014 03:00:51"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00:00 AM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00:01 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 1:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00 AM"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("4/02/2014 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59.3186369"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/3/31"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/03/31"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/4/8 22:05"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/04/08 22:05"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/04/2 03:00:51"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/4/02 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59.3186369"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014年04月08日"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("2006-01-02T15:04:05+0000"));
assertEquals("2009-08-13", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09-07:00"));
assertEquals("2009-08-12", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.3186369"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.123"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43:22"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 UTC"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 GMT"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 05:24:37 PM"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800 +08"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:44 +09:00"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000 +0000 UTC"));
assertEquals("2015-09-30", GraphCleaningFunctions.cleanDate("2015-09-30 18:48:56.35272715 +0000 UTC"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 GMT"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 UTC"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001"));
assertEquals("2017-07-19", GraphCleaningFunctions.cleanDate("2017-07-19 03:21:51+00:00"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26"));
assertEquals("2014-04-01", GraphCleaningFunctions.cleanDate("2014-04"));
assertEquals("2014-01-01", GraphCleaningFunctions.cleanDate("2014"));
assertEquals("2014-05-11", GraphCleaningFunctions.cleanDate("2014-05-11 08:20:13,787"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3.31.2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03.31.2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08.21.71"));
assertEquals("2014-03-01", GraphCleaningFunctions.cleanDate("2014.03"));
assertEquals("2014-03-30", GraphCleaningFunctions.cleanDate("2014.03.30"));
assertEquals("2014-06-01", GraphCleaningFunctions.cleanDate("20140601"));
assertEquals("2014-07-22", GraphCleaningFunctions.cleanDate("20140722105203"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("1332151919"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367189"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222333"));
}
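
These assertions track an API change in GraphCleaningFunctions: the Optional-returning doCleanDate is wrapped by cleanDate, which, as the assertNotNull checks imply, returns the normalized date directly or null for unparsable input. A minimal sketch of the two call styles under that assumption:

import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions

val viaOptional = GraphCleaningFunctions.doCleanDate("2016 Apr 05").get() // "2016-04-05"
val viaNullable = GraphCleaningFunctions.cleanDate("2016 Apr 05")         // "2016-04-05"
assert(viaOptional == viaNullable)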
@ -185,6 +184,22 @@ class OafMapperUtilsTest {
.getClassid());
}
@Test
void testDelegatedAuthority() throws IOException {
Dataset d1 = read("dataset_2.json", Dataset.class);
Dataset d2 = read("dataset_delegated.json", Dataset.class);
assertEquals(1, d2.getCollectedfrom().size());
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
Result res = OafMapperUtils.mergeResults(d1, d2);
assertEquals(d2, res);
System.out.println(OBJECT_MAPPER.writeValueAsString(res));
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
}

File diff suppressed because one or more lines are too long

View File

@ -1 +1,140 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}

View File

@ -0,0 +1,140 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-actionmanager</artifactId>

View File

@ -107,7 +107,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
@ -159,7 +159,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>

View File

@ -107,7 +107,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -159,7 +159,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

View File

@ -99,7 +99,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=10000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
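
The three hunks above only retune spark.sql.shuffle.partitions (2560 and 5000 become 7000, and 10000 for relations), raising reduce-side parallelism for the larger graph tables. The same knob can be set programmatically; a minimal sketch:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("shuffle-partitions-demo").getOrCreate()
// Equivalent of passing --conf spark.sql.shuffle.partitions=7000 in <spark-opts>:
// it controls how many partitions joins and aggregations shuffle into.
spark.conf.set("spark.sql.shuffle.partitions", "7000")
assert(spark.conf.get("spark.sql.shuffle.partitions") == "7000")
spark.close()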

View File

@ -172,6 +172,61 @@ public class PromoteActionPayloadForGraphTableJobTest {
}
}
@Test
void shouldPromoteActionPayload_custom() throws Exception {
Class<? extends Oaf> rowClazz = Publication.class;
Class<? extends Oaf> actionPayloadClazz = Result.class;
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.MERGE_FROM_AND_GET;
// given
Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz);
Path inputActionPayloadDir = createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz);
Path outputGraphTableDir = outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase());
// when
PromoteActionPayloadForGraphTableJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputGraphTablePath",
inputGraphTableDir.toString(),
"-graphTableClassName",
rowClazz.getCanonicalName(),
"-inputActionPayloadPath",
inputActionPayloadDir.toString(),
"-actionPayloadClassName",
actionPayloadClazz.getCanonicalName(),
"-outputGraphTablePath",
outputGraphTableDir.toString(),
"-mergeAndGetStrategy",
strategy.name(),
"--shouldGroupById",
"true"
});
// then
assertTrue(Files.exists(outputGraphTableDir));
List<? extends Oaf> actualOutputRows = readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz)
.collectAsList()
.stream()
.sorted(Comparator.comparingInt(Object::hashCode))
.collect(Collectors.toList());
Publication p = actualOutputRows
.stream()
.map(o -> (Publication) o)
.filter(o -> "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879".equals(o.getId()))
.findFirst()
.get();
assertNotNull(p.getMeasures());
assertTrue(p.getMeasures().size() > 0);
}
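
The new test exercises the --shouldGroupById switch: graph rows sharing an id are folded into a single output row, so the measures carried by the action payload survive the merge. A schematic Scala sketch of that grouping step, with a toy merge function standing in for MERGE_FROM_AND_GET:

// Toy model of the groupById promotion: rows sharing an id are reduced to one.
case class Rec(id: String, measures: List[String])

def mergeFromAndGet(a: Rec, b: Rec): Rec = // stand-in for the real strategy
  Rec(a.id, (a.measures ++ b.measures).distinct)

val rows     = List(Rec("x", Nil), Rec("x", List("influence")), Rec("y", Nil))
val promoted = rows.groupBy(_.id).values.map(_.reduce(mergeFromAndGet)).toList
assert(promoted.exists(r => r.id == "x" && r.measures == List("influence")))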
public static Stream<Arguments> promoteJobTestParams() {
return Stream
.of(

View File

@ -17,4 +17,5 @@
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018243405,"id":"50|CSC_________::00019460865d6cc381b36076131a5bc1","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"Computer Science::Networking and Internet Architecture","qualifier":{"classid":"arxiv","classname":"arxiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7416","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018243405,"id":"50|CSC_________::00019460865d6cc381b36076131a5bc1","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"Computer Science::Networking and Internet Architecture","qualifier":{"classid":"arxiv","classname":"arxiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7416","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":[{"id":"influence","unit":[{"key":"score","value":"1.64385446761e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity_alt","unit":[{"key":"score","value":"18.9590813696","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity","unit":[{"key":"score","value":"6.00577981643e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"author":null,"resulttype":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":null}

View File

@@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [
  AvoidInfix,
  ExpandImportSelectors,
  RedundantBraces,
  RedundantParens,
  SortImports,
  SortModifiers,
  PreferCurlyFors
]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

View File

@@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<build>

View File

@@ -13,19 +13,24 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Constants {
public static final String DOI = "doi";
public static final String DOI_CLASSNAME = "Digital Object Identifier";
public static final String DEFAULT_DELIMITER = ",";
public static final String DEFAULT_FOS_DELIMITER = "\t";
public static final String UPDATE_DATA_INFO_TYPE = "update";
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
public static final String UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID = "measure:usage_counts";
public static final String UPDATE_KEY_USAGE_COUNTS = "count";
public static final String FOS_CLASS_ID = "FOS";
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
@@ -55,13 +60,13 @@ public class Constants {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static StructuredProperty getSubject(String sbj, String classid, String classname,
public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
if (sbj.equals(NULL))
if (sbj == null || sbj.equals(NULL))
return null;
StructuredProperty sp = new StructuredProperty();
sp.setValue(sbj);
sp
Subject s = new Subject();
s.setValue(sbj);
s
.setQualifier(
OafMapperUtils
.qualifier(
@@ -69,7 +74,7 @@ public class Constants {
classname,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
sp
s
.setDataInfo(
OafMapperUtils
.dataInfo(
@@ -85,7 +90,7 @@ public class Constants {
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return sp;
return s;
}
}
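A minimal sketch of the reworked helper as seen from a caller; the wrapper class is invented for illustration, while Constants and the FOS constant values are the ones defined above:

import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.schema.oaf.Subject;

public class GetSubjectSketch {
	public static void main(String[] args) {
		// builds a Subject qualified with the FOS classification
		Subject s = Constants
			.getSubject(
				"animal diseases", "FOS",
				"Fields of Science and Technology classification", "subject:fos");
		System.out.println(s.getValue() + " / " + s.getQualifier().getClassid());
		// the added null guard: both null and the "NULL" sentinel map to null
		System.out.println(Constants.getSubject(null, "FOS", "", "subject:fos"));
	}
}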

View File

@@ -1,7 +1,7 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_FOS_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -9,8 +9,6 @@ import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
@@ -49,7 +47,7 @@ public class GetFOSSparkJob implements Serializable {
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
.orElse(DEFAULT_FOS_DELIMITER);
SparkConf sconf = new SparkConf();
runWithSparkSession(

View File

@@ -33,6 +33,7 @@ import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
@@ -95,11 +96,39 @@ public class PrepareBipFinder implements Serializable {
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
.map((MapFunction<BipScore, Result>) v -> {
Result r = new Result();
final String cleanedPid = CleaningFunctions.normalizePidValue(DOI, v.getId());
r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
Instance inst = new Instance();
inst.setMeasures(getMeasure(v));
inst
.setPid(
Arrays
.asList(
OafMapperUtils
.structuredProperty(
cleanedPid,
OafMapperUtils
.qualifier(
DOI, DOI_CLASSNAME,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES),
null)));
r.setInstance(Arrays.asList(inst));
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()

View File

@@ -21,8 +21,11 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareFOSSparkJob implements Serializable {
@@ -71,16 +74,30 @@ public class PrepareFOSSparkJob {
Result r = new Result();
FOSDataModel first = it.next();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
HashSet<String> level1 = new HashSet<>();
HashSet<String> level2 = new HashSet<>();
HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()

View File

@@ -21,8 +21,11 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareSDGSparkJob implements Serializable {
@@ -71,13 +74,26 @@ public class PrepareSDGSparkJob {
Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
SDGDataModel first = it.next();
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
it
.forEachRemaining(
s -> sbjs
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()

View File

@@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@@ -67,7 +68,19 @@ public class SparkSaveUnresolved implements Serializable {
.groupByKey((MapFunction<Result, String>) Result::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next();
it.forEachRemaining(r -> ret.mergeFrom(r));
it.forEachRemaining(r -> {
if (r.getInstance() != null) {
ret.setInstance(r.getInstance());
}
if (r.getSubject() != null) {
if (ret.getSubject() != null)
ret.getSubject().addAll(r.getSubject());
else
ret.setSubject(r.getSubject());
}
// ret.mergeFrom(r)
});
return ret;
}, Encoders.bean(Result.class))
.write()
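The explicit merge above replaces the previous ret.mergeFrom(r) call; as a hedged sketch, the same logic factored into a plain helper (name invented) makes the semantics visible: the instance list of the last record wins, while subjects accumulate:

import eu.dnetlib.dhp.schema.oaf.Result;

// hypothetical helper mirroring the mapGroups body above
static Result mergeUnresolved(Result acc, Result r) {
	if (r.getInstance() != null) {
		acc.setInstance(r.getInstance()); // replaced, not appended
	}
	if (r.getSubject() != null) {
		if (acc.getSubject() != null)
			acc.getSubject().addAll(r.getSubject());
		else
			acc.setSubject(r.getSubject());
	}
	return acc;
}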

View File

@@ -14,6 +14,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@@ -83,10 +85,13 @@ public class CreateActionSetSparkJob implements Serializable {
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
boolean shouldDuplicateRels) {
spark
.sqlContext()
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
.read()
.textFile(inputPath + "/*")
.map(
(MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
Encoders.bean(COCI.class))
.flatMap(
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
(FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
Encoders.bean(Relation.class))
.filter((FilterFunction<Relation>) value -> value != null)
.toJavaRDD()
@@ -98,26 +103,30 @@
}
private static List<Relation> createRelation(String value, boolean duplicate) {
String[] line = value.split(",");
if (!line[1].startsWith("10.")) {
return new ArrayList<>();
}
private static List<Relation> createRelation(COCI value, boolean duplicate) {
List<Relation> relationList = new ArrayList<>();
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
String citing = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
final String cited = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
relationList
.addAll(
getRelations(
citing,
cited));
if (!citing.equals(cited)) {
relationList
.add(
getRelation(
citing,
cited, ModelConstants.CITES));
if (duplicate && line[1].endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
if (duplicate && value.getCiting().endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(
CleaningFunctions
.normalizePidValue(
"doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
relationList.add(getRelation(citing, cited, ModelConstants.CITES));
}
}
return relationList;
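A hedged sketch of the new contract, as it could be exercised from a test in the same class (the method is private, so this is a fragment rather than standalone code; the DOI values are invented): createRelation now receives a parsed COCI bean instead of a raw CSV line, and the citing/cited comparison guards against self-citations:

COCI coci = new COCI();
coci.setCiting("10.1000/a"); // hypothetical citing DOI
coci.setCited("10.1000/b"); // hypothetical cited DOI
// distinct DOIs: one CITES relation
List<Relation> rels = createRelation(coci, false);

coci.setCited("10.1000/a");
// citing equals cited: the guard yields an empty list
List<Relation> none = createRelation(coci, false);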

View File

@@ -0,0 +1,103 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class ReadCOCI implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
ReadCOCI.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String[] inputFile = parser.get("inputFile").split(";");
log.info("inputFile: {}", String.join(";", inputFile));
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath {}", workingPath);
SparkConf sconf = new SparkConf();
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
runWithSparkSession(
sconf,
isSparkSessionManaged,
spark -> {
doRead(
spark,
workingPath,
inputFile,
outputPath,
delimiter);
});
}
private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
String outputPath,
String delimiter) throws IOException {
for (String inputFile : inputFiles) {
String sourcePath = workingPath + "/" + inputFile + ".gz";
Dataset<Row> cociData = spark
.read()
.format("csv")
.option("sep", delimiter)
.option("inferSchema", "true")
.option("header", "true")
.option("quote", "\"")
.load(sourcePath)
.repartition(100);
cociData.map((MapFunction<Row, COCI>) row -> {
COCI coci = new COCI();
coci.setOci(row.getString(0));
coci.setCiting(row.getString(1));
coci.setCited(row.getString(2));
return coci;
}, Encoders.bean(COCI.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + inputFile);
}
}
}
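Each input file ends up as a separate gzipped JSON dataset; a sketch of how a downstream job could decode it back into COCI beans, following the same pattern CreateActionSetSparkJob uses above (the wrapper class name is invented and the path is a placeholder):

import com.fasterxml.jackson.databind.ObjectMapper;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;

public class ReadBackSketch {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	// decodes one per-file output written by ReadCOCI.doRead
	static Dataset<COCI> readBack(SparkSession spark, String path) {
		return spark
			.read()
			.textFile(path) // e.g. outputPath + inputFile
			.map((MapFunction<String, COCI>) v -> OBJECT_MAPPER.readValue(v, COCI.class), Encoders.bean(COCI.class));
	}
}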

View File

@@ -0,0 +1,39 @@
package eu.dnetlib.dhp.actionmanager.opencitations.model;
import java.io.Serializable;
import com.opencsv.bean.CsvBindByPosition;
public class COCI implements Serializable {
private String oci;
private String citing;
private String cited;
public String getOci() {
return oci;
}
public void setOci(String oci) {
this.oci = oci;
}
public String getCiting() {
return citing;
}
public void setCiting(String citing) {
this.citing = citing;
}
public String getCited() {
return cited;
}
public void setCited(String cited) {
this.cited = cited;
}
}

View File

@@ -266,11 +266,15 @@ public class PrepareProgramme {
String code = csvProgramme.getCode();
if (!code.endsWith(".") && !code.contains("Euratom")
&& !code.equals("H2020-EC"))
&& !code.equals("H2020-EC") && !code.equals("H2020") &&
!code.equals("H2020-Topics"))
code += ".";
csvProgramme.setClassification(map.get(code)._1());
csvProgramme.setClassification_short(map.get(code)._2());
if (map.containsKey(code)) {
csvProgramme.setClassification(map.get(code)._1());
csvProgramme.setClassification_short(map.get(code)._2());
} else
log.warn("No entry in map for code {}", code);
return csvProgramme;
}).collect();

View File

@@ -3,12 +3,23 @@ package eu.dnetlib.dhp.actionmanager.project;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.*;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@@ -19,6 +30,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import scala.Tuple2;
@@ -54,6 +66,9 @@ public class PrepareProjects {
final String projectPath = parser.get("projectPath");
log.info("projectPath: {}", projectPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
@@ -76,7 +91,7 @@
}
private static void exec(SparkSession spark, String projectPath, String dbProjectPath, String outputPath) {
Dataset<CSVProject> project = readPath(spark, projectPath, CSVProject.class);
Dataset<Project> project = readPath(spark, projectPath, Project.class);
Dataset<ProjectSubset> dbProjects = readPath(spark, dbProjectPath, ProjectSubset.class);
dbProjects
@@ -90,14 +105,14 @@
}
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
private static FlatMapFunction<Tuple2<ProjectSubset, Project>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
return value -> {
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
List<CSVProject> csvProjectList = new ArrayList<>();
if (csvProject.isPresent()) {
if (Optional.ofNullable(value._2()).isPresent()) {
Project project = value._2();
String[] programme = csvProject.get().getProgramme().split(";");
String topic = csvProject.get().getTopics();
String[] programme = project.getLegalBasis().split(";");
String topic = project.getTopics();
Arrays
.stream(programme)
@@ -106,7 +121,7 @@
proj.setTopics(topic);
proj.setProgramme(p);
proj.setId(csvProject.get().getId());
proj.setId(project.getId());
csvProjectList.add(proj);
});
}
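In short, every matched Project is fanned out into one CSVProject per entry of its legalBasis field; a standalone sketch of that fan-out (the wrapper class is invented; the bean setters are assumed to match the model classes shown below):

import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;

public class FanOutSketch {
	// mirrors the body of getTuple2CSVProjectFlatMapFunction
	static List<CSVProject> fanOut(Project project) {
		List<CSVProject> out = new ArrayList<>();
		for (String p : project.getLegalBasis().split(";")) {
			CSVProject proj = new CSVProject();
			proj.setTopics(project.getTopics());
			proj.setProgramme(p);
			proj.setId(project.getId());
			out.add(proj);
		}
		return out;
	}
}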

View File

@@ -24,6 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic;
import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
@@ -110,7 +111,7 @@ public class SparkAtomicActionJob {
Dataset<CSVProject> project = readPath(spark, projectPatH, CSVProject.class);
Dataset<CSVProgramme> programme = readPath(spark, programmePath, CSVProgramme.class);
Dataset<EXCELTopic> topic = readPath(spark, topicPath, EXCELTopic.class);
Dataset<JsonTopic> topic = readPath(spark, topicPath, JsonTopic.class);
Dataset<Project> aaproject = project
.joinWith(programme, project.col("programme").equalTo(programme.col("code")), "left")
@@ -124,9 +125,7 @@
Project pp = new Project();
pp
.setId(
createOpenaireId(
ModelSupport.entityIdPrefix.get("project"),
"corda__h2020", csvProject.getId()));
csvProject.getId());
pp.setH2020topiccode(csvProject.getTopics());
H2020Programme pm = new H2020Programme();
H2020Classification h2020classification = new H2020Classification();
@@ -144,10 +143,15 @@
.filter(Objects::nonNull);
aaproject
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
.joinWith(topic, aaproject.col("id").equalTo(topic.col("projectID")), "left")
.map((MapFunction<Tuple2<Project, JsonTopic>, Project>) p -> {
Optional<JsonTopic> op = Optional.ofNullable(p._2());
Project rp = p._1();
rp
.setId(
createOpenaireId(
ModelSupport.entityIdPrefix.get("project"),
"corda__h2020", rp.getId()));
op.ifPresent(excelTopic -> rp.setH2020topicdescription(excelTopic.getTitle()));
return rp;
}, Encoders.bean(Project.class))

View File

@@ -22,6 +22,7 @@ import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic;
/**
* Reads a generic excel file and maps it into classes that mirror its schema
*/
@Deprecated
public class EXCELParser {
public <R> List<R> parse(InputStream file, String classForName, String sheetName)

View File

@@ -0,0 +1,101 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class ExtractFromZip implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ExtractFromZip.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareProjects.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/extract_fromzip_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fs = FileSystem.get(conf);
doExtract(inputPath, outputPath, fs);
}
private static void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
throws IOException {
final Path path = new Path(inputFile);
FSDataInputStream projectZip = fileSystem.open(path);
try (ZipInputStream zis = new ZipInputStream(projectZip)) {
ZipEntry entry = null;
while ((entry = zis.getNextEntry()) != null) {
if (!entry.isDirectory()) {
String fileName = entry.getName();
byte[] buffer = new byte[1024];
int count;
try (
FSDataOutputStream out = fileSystem
.create(new Path(workingPath + fileName))) {
while ((count = zis.read(buffer, 0, buffer.length)) != -1)
out.write(buffer, 0, count);
}
}
}
}
}
}
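A hedged invocation sketch; the flag names are assumed to match the json parameter definitions referenced above, and the paths are placeholders:

// hypothetical command-line style invocation
ExtractFromZip.main(new String[] {
	"--inputPath", "/tmp/project.zip",
	"--outputPath", "/tmp/extracted/",
	"--hdfsNameNode", "hdfs://nameservice1"
});

Note that doExtract concatenates workingPath and the zip entry name directly, so the output path should end with a slash.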

View File

@@ -6,7 +6,9 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.GetCSV;
@@ -40,8 +42,11 @@ public class ReadCSV {
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
FSDataInputStream inputStream = fileSystem.open(new Path(fileURL));
BufferedReader reader = new BufferedReader(
new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL)));
new InputStreamReader(inputStream));
GetCSV.getCsv(fileSystem, reader, hdfsPath, classForName, del);

View File

@@ -0,0 +1,90 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class ReadProjects implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadProjects.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareProjects.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/read_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fs = FileSystem.get(conf);
readProjects(inputPath, outputPath, fs);
}
public static void readProjects(String inputFile, String workingPath, FileSystem fs) throws IOException {
Path hdfsReadPath = new Path(inputFile);
FSDataInputStream inputStream = fs.open(hdfsReadPath);
ArrayList<Project> projects = OBJECT_MAPPER
.readValue(
IOUtils.toString(inputStream, StandardCharsets.UTF_8),
new TypeReference<List<Project>>() {
});
Path hdfsWritePath = new Path(workingPath);
if (fs.exists(hdfsWritePath)) {
fs.delete(hdfsWritePath, false);
}
FSDataOutputStream fos = fs.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
for (Project p : projects) {
writer.write(OBJECT_MAPPER.writeValueAsString(p));
writer.newLine();
}
}
}
}

View File

@@ -0,0 +1,92 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class ReadTopics implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadTopics.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareProjects.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/read_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fs = FileSystem.get(conf);
readTopics(inputPath, outputPath, fs);
}
public static void readTopics(String inputFile, String workingPath, FileSystem fs) throws IOException {
Path hdfsReadPath = new Path(inputFile);
FSDataInputStream inputStream = fs.open(hdfsReadPath);
ArrayList<JsonTopic> topics = OBJECT_MAPPER
.readValue(
IOUtils.toString(inputStream, StandardCharsets.UTF_8),
new TypeReference<List<JsonTopic>>() {
});
Path hdfsWritePath = new Path(workingPath);
if (fs.exists(hdfsWritePath)) {
fs.delete(hdfsWritePath, false);
}
FSDataOutputStream fos = fs.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
for (JsonTopic p : topics) {
writer.write(OBJECT_MAPPER.writeValueAsString(p));
writer.newLine();
}
}
}
}

View File

@@ -13,7 +13,7 @@ public class CSVProject implements Serializable {
@CsvBindByName(column = "id")
private String id;
@CsvBindByName(column = "programme")
@CsvBindByName(column = "legalBasis")
private String programme;
@CsvBindByName(column = "topics")

View File

@@ -6,6 +6,7 @@ import java.io.Serializable;
/**
* the model class for the topic excel file
*/
@Deprecated
public class EXCELTopic implements Serializable {
private String rcn;
private String language;
@@ -17,9 +18,27 @@ public class EXCELTopic implements Serializable {
private String title;
private String shortTitle;
private String objective;
private String subjects;
private String keywords;
private String legalBasis;
private String call;
private String id;
private String contentUpdateDate;
public String getContentUpdateDate() {
return contentUpdateDate;
}
public void setContentUpdateDate(String contentUpdateDate) {
this.contentUpdateDate = contentUpdateDate;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getRcn() {
return rcn;
@@ -101,12 +120,12 @@ public class EXCELTopic implements Serializable {
this.objective = objective;
}
public String getSubjects() {
return subjects;
public String getKeywords() {
return keywords;
}
public void setSubjects(String subjects) {
this.subjects = subjects;
public void setKeywords(String keywords) {
this.keywords = keywords;
}
public String getLegalBasis() {

View File

@@ -0,0 +1,38 @@
package eu.dnetlib.dhp.actionmanager.project.utils.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class JsonTopic implements Serializable {
private String projectID;
private String title;
private String topic;
public String getProjectID() {
return projectID;
}
public void setProjectID(String projectID) {
this.projectID = projectID;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getTopic() {
return topic;
}
public void setTopic(String topic) {
this.topic = topic;
}
}

View File

@@ -0,0 +1,191 @@
package eu.dnetlib.dhp.actionmanager.project.utils.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 24/02/23
*/
public class Project implements Serializable {
private String acronym;
private String contentUpdateDate;
private String ecMaxContribution;
private String ecSignatureDate;
private String endDate;
private String frameworkProgramme;
private String fundingScheme;
private String grantDoi;
private String id;
private String legalBasis;
private String masterCall;
private String nature;
private String objective;
private String rcn;
private String startDate;
private String status;
private String subCall;
private String title;
private String topics;
private String totalCost;
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getContentUpdateDate() {
return contentUpdateDate;
}
public void setContentUpdateDate(String contentUpdateDate) {
this.contentUpdateDate = contentUpdateDate;
}
public String getEcMaxContribution() {
return ecMaxContribution;
}
public void setEcMaxContribution(String ecMaxContribution) {
this.ecMaxContribution = ecMaxContribution;
}
public String getEcSignatureDate() {
return ecSignatureDate;
}
public void setEcSignatureDate(String ecSignatureDate) {
this.ecSignatureDate = ecSignatureDate;
}
public String getEndDate() {
return endDate;
}
public void setEndDate(String endDate) {
this.endDate = endDate;
}
public String getFrameworkProgramme() {
return frameworkProgramme;
}
public void setFrameworkProgramme(String frameworkProgramme) {
this.frameworkProgramme = frameworkProgramme;
}
public String getFundingScheme() {
return fundingScheme;
}
public void setFundingScheme(String fundingScheme) {
this.fundingScheme = fundingScheme;
}
public String getGrantDoi() {
return grantDoi;
}
public void setGrantDoi(String grantDoi) {
this.grantDoi = grantDoi;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getLegalBasis() {
return legalBasis;
}
public void setLegalBasis(String legalBasis) {
this.legalBasis = legalBasis;
}
public String getMasterCall() {
return masterCall;
}
public void setMasterCall(String masterCall) {
this.masterCall = masterCall;
}
public String getNature() {
return nature;
}
public void setNature(String nature) {
this.nature = nature;
}
public String getObjective() {
return objective;
}
public void setObjective(String objective) {
this.objective = objective;
}
public String getRcn() {
return rcn;
}
public void setRcn(String rcn) {
this.rcn = rcn;
}
public String getStartDate() {
return startDate;
}
public void setStartDate(String startDate) {
this.startDate = startDate;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getSubCall() {
return subCall;
}
public void setSubCall(String subCall) {
this.subCall = subCall;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getTopics() {
return topics;
}
public void setTopics(String topics) {
this.topics = topics;
}
public String getTotalCost() {
return totalCost;
}
public void setTotalCost(String totalCost) {
this.totalCost = totalCost;
}
}

View File

@@ -29,8 +29,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@@ -38,16 +37,17 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
@@ -60,10 +60,8 @@ public class GenerateRorActionSetJob {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ROR_NS_PREFIX = "ror_________";
private static final List<KeyValue> ROR_COLLECTED_FROM = listKeyValues(
"10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR");
Constants.ROR_OPENAIRE_ID, Constants.ROR_DATASOURCE_NAME);
private static final DataInfo ROR_DATA_INFO = dataInfo(
false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92");
@@ -112,25 +110,22 @@
final String outputPath) throws IOException {
readInputPath(spark, inputPath)
.map(
(MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg,
Encoders.bean(Organization.class))
.toJavaRDD()
.map(o -> new AtomicAction<>(Organization.class, o))
.map(GenerateRorActionSetJob::convertRorOrg)
.flatMap(List::iterator)
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
protected static Organization convertRorOrg(final RorOrganization r) {
protected static List<AtomicAction<? extends Oaf>> convertRorOrg(final RorOrganization r) {
final Date now = new Date();
final Organization o = new Organization();
o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId())));
o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId())));
o.setId(calculateOpenaireId(r.getId()));
o.setOriginalId(Arrays.asList(String.format("%s::%s", Constants.ROR_NS_PREFIX, r.getId())));
o.setCollectedfrom(ROR_COLLECTED_FROM);
o.setPid(pids(r));
o.setDateofcollection(now.toString());
@@ -166,7 +161,15 @@
o.setDataInfo(ROR_DATA_INFO);
o.setLastupdatetimestamp(now.getTime());
return o;
final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
res.add(new AtomicAction<>(Organization.class, o));
return res;
}
private static String calculateOpenaireId(final String rorId) {
return String.format("20|%s::%s", Constants.ROR_NS_PREFIX, DHPUtils.md5(rorId));
}
private static List<StructuredProperty> pids(final RorOrganization r) {
@@ -202,14 +205,14 @@
.collect(Collectors.toList());
}
private static Dataset<RorOrganization> readInputPath(
private static JavaRDD<RorOrganization> readInputPath(
final SparkSession spark,
final String path) throws IOException {
try (final FileSystem fileSystem = FileSystem.get(new Configuration());
final InputStream is = fileSystem.open(new Path(path))) {
final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class)).toJavaRDD();
}
}

View File

@@ -0,0 +1,186 @@
package eu.dnetlib.dhp.actionmanager.usagestats;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
/**
* created the Atomic Action for each type of results
*/
public class SparkAtomicActionUsageJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionUsageJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionUsageJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
final String dbname = parser.get("usagestatsdb");
final String workingPath = parser.get("workingPath");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareData(dbname, spark, workingPath + "/usageDb", "usage_stats", "result_id");
prepareData(dbname, spark, workingPath + "/projectDb", "project_stats", "id");
prepareData(dbname, spark, workingPath + "/datasourceDb", "datasource_stats", "repositor_id");
writeActionSet(spark, workingPath, outputPath);
});
}
private static void prepareData(String dbname, SparkSession spark, String workingPath, String tableName,
String attribute_name) {
spark
.sql(
String
.format(
"select %s as id, sum(downloads) as downloads, sum(views) as views " +
"from %s.%s group by %s",
attribute_name, dbname, tableName, attribute_name))
.as(Encoders.bean(UsageStatsModel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
}
public static void writeActionSet(SparkSession spark, String inputPath, String outputPath) {
getFinalIndicatorsResult(spark, inputPath + "/usageDb")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.union(
getFinalIndicatorsProject(spark, inputPath + "/projectDb")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p)))
.union(
getFinalIndicatorsDatasource(spark, inputPath + "/datasourceDb")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p)))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static Dataset<Result> getFinalIndicatorsResult(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, UsageStatsModel.class)
.map((MapFunction<UsageStatsModel, Result>) usm -> {
Result r = new Result();
r.setId("50|" + usm.getId());
r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
return r;
}, Encoders.bean(Result.class));
}
private static Dataset<Project> getFinalIndicatorsProject(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, UsageStatsModel.class)
.map((MapFunction<UsageStatsModel, Project>) usm -> {
Project p = new Project();
p.setId("40|" + usm.getId());
p.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
return p;
}, Encoders.bean(Project.class));
}
private static Dataset<Datasource> getFinalIndicatorsDatasource(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, UsageStatsModel.class)
.map((MapFunction<UsageStatsModel, Datasource>) usm -> {
Datasource d = new Datasource();
d.setId("10|" + usm.getId());
d.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
return d;
}, Encoders.bean(Datasource.class));
}
private static List<Measure> getMeasure(Long downloads, Long views) {
DataInfo dataInfo = OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"");
return Arrays
.asList(
OafMapperUtils
.newMeasureInstance("downloads", String.valueOf(downloads), UPDATE_KEY_USAGE_COUNTS, dataInfo),
OafMapperUtils.newMeasureInstance("views", String.valueOf(views), UPDATE_KEY_USAGE_COUNTS, dataInfo));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}
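The three branches above differ only in the entity prefix prepended to the id; a tiny sketch of the bean the aggregation query is decoded into (all values invented):

UsageStatsModel usm = new UsageStatsModel();
usm.setId("openaire____::abc123"); // hypothetical id coming from the stats db
usm.setDownloads(10L);
usm.setViews(25L);
// getFinalIndicatorsResult then prefixes "50|", ...Project "40|", ...Datasource "10|"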

View File

@@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.usagestats;
import java.io.Serializable;
public class UsageStatsModel implements Serializable {
private String id;
private Long downloads;
private Long views;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Long getDownloads() {
return downloads;
}
public void setDownloads(Long downloads) {
this.downloads = downloads;
}
public Long getViews() {
return views;
}
public void setViews(Long views) {
this.views = views;
}
}

View File

@@ -19,6 +19,8 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
@@ -114,6 +116,10 @@ public class CollectorWorker extends ReportingJob {
return new OaiCollectorPlugin(clientParams);
case rest_json2xml:
return new RestCollectorPlugin(clientParams);
case file:
return new FileCollectorPlugin(fileSystem);
case fileGzip:
return new FileGZipCollectorPlugin(fileSystem);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type"))

View File

@@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
public interface CollectorPlugin {
enum NAME {
oai, other, rest_json2xml;
oai, other, rest_json2xml, file, fileGzip;
public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb

View File

@@ -0,0 +1,80 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);
public static final String SPLIT_ON_ELEMENT = "splitOnElement";
private final FileSystem fileSystem;
public AbstractSplittedRecordPlugin(FileSystem fileSystem) {
this.fileSystem = fileSystem;
}
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
// get path to file
final Path filePath = Optional
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow(() -> new CollectorException("missing baseUrl"));
log.info("baseUrl: {}", filePath);
// check that path to file exists
try {
if (!fileSystem.exists(filePath)) {
throw new CollectorException("path does not exist: " + filePath);
}
} catch (IOException e) {
throw new CollectorException(e);
}
// get split element
final String splitOnElement = Optional
.ofNullable(api.getParams().get(SPLIT_ON_ELEMENT))
.orElseThrow(
() -> new CollectorException(String
.format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT)));
log.info("splitOnElement: {}", splitOnElement);
final BufferedInputStream bis = getBufferedInputStream(filePath);
Iterator<String> xmlIterator = new XMLIterator(splitOnElement, bis);
return StreamSupport
.stream(
Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED),
false);
}
abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException;
public FileSystem getFileSystem() {
return fileSystem;
}
}

View File

@@ -0,0 +1,33 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
public FileCollectorPlugin(FileSystem fileSystem) {
super(fileSystem);
}
@Override
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
log.info("filePath: {}", filePath);
try {
FileSystem fs = super.getFileSystem();
return new BufferedInputStream(fs.open(filePath));
} catch (Exception e) {
throw new CollectorException("Error reading file " + filePath, e);
}
}
}

View File

@@ -0,0 +1,35 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import java.util.zip.GZIPInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPlugin.class);
public FileGZipCollectorPlugin(FileSystem fileSystem) {
super(fileSystem);
}
@Override
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
log.info("filePath: {}", filePath);
try {
FileSystem fs = super.getFileSystem();
GZIPInputStream stream = new GZIPInputStream(fs.open(filePath));
return new BufferedInputStream(stream);
} catch (Exception e) {
throw new CollectorException("Error reading file " + filePath, e);
}
}
}
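An end-to-end sketch of the new file plugins; the HDFS path is a placeholder, and it is assumed here that ApiDescriptor requires its params map to be initialised by the caller and that AggregatorReport offers a no-argument constructor:

import java.util.HashMap;
import java.util.stream.Stream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.file.AbstractSplittedRecordPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;

public class FilePluginSketch {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration());
		ApiDescriptor api = new ApiDescriptor();
		api.setBaseUrl("hdfs:///data/dump.xml.gz"); // placeholder location
		api.setParams(new HashMap<>()); // assumption: params must be initialised
		api.getParams().put(AbstractSplittedRecordPlugin.SPLIT_ON_ELEMENT, "record");
		// each emitted String is one XML record split out of the gzipped dump
		try (Stream<String> records = new FileGZipCollectorPlugin(fs).collect(api, new AggregatorReport())) {
			records.limit(2).forEach(System.out::println);
		}
	}
}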

View File

@@ -19,7 +19,7 @@ import org.dom4j.io.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.XmlCleaner;
import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpConnector2;

View File

@@ -30,7 +30,7 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.JsonUtils;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;

View File

@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.plugin.utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

View File

@@ -0,0 +1,177 @@
package eu.dnetlib.dhp.collection.plugin.utils;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Iterator;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class XMLIterator implements Iterator<String> {
private static final Log log = LogFactory.getLog(XMLIterator.class);
private final ThreadLocal<XMLInputFactory> inputFactory = ThreadLocal.withInitial(XMLInputFactory::newInstance);
private final ThreadLocal<XMLOutputFactory> outputFactory = ThreadLocal.withInitial(XMLOutputFactory::newInstance);
private final ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal.withInitial(XMLEventFactory::newInstance);
public static final String UTF_8 = "UTF-8";
final XMLEventReader parser;
private XMLEvent current = null;
private String element;
private InputStream inputStream;
public XMLIterator(final String element, final InputStream inputStream) {
super();
this.element = element;
this.inputStream = inputStream;
this.parser = getParser();
try {
this.current = findElement(parser);
} catch (XMLStreamException e) {
log.warn("cannot init parser position. No element found: " + element);
current = null;
}
}
@Override
public boolean hasNext() {
return current != null;
}
@Override
public String next() {
String result = null;
try {
result = copy(parser);
current = findElement(parser);
return result;
} catch (XMLStreamException e) {
throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
@SuppressWarnings("finally")
private String copy(final XMLEventReader parser) throws XMLStreamException {
final StringWriter result = new StringWriter();
try {
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result);
final StartElement start = current.asStartElement();
final StartElement newRecord = eventFactory
.get()
.createStartElement(start.getName(), start.getAttributes(), start.getNamespaces());
// new root record
writer.add(newRecord);
// copy the rest as it is
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
// TODO: replace with depth tracking instead of close tag tracking.
if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
writer.add(event);
break;
}
writer.add(event);
}
writer.close();
} finally {
return result.toString();
}
}
/**
* Looks for the next occurrence of the splitter element.
*
* @param parser the event reader to advance
* @return the start element event opening the next record, or null when the stream is exhausted
* @throws XMLStreamException if the underlying stream cannot be parsed
*/
private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException {
XMLEvent peek = parser.peek();
if (peek != null && peek.isStartElement()) {
String name = peek.asStartElement().getName().getLocalPart();
if (element.equals(name)) {
return peek;
}
}
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
if (event != null && event.isStartElement()) {
String name = event.asStartElement().getName().getLocalPart();
if (element.equals(name)) {
return event;
}
}
}
return null;
}
private XMLEventReader getParser() {
try {
return inputFactory.get().createXMLEventReader(sanitize(inputStream));
} catch (XMLStreamException e) {
throw new RuntimeException(e);
}
}
private Reader sanitize(final InputStream in) {
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
return new InputStreamReader(in, charsetDecoder);
}
}
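A minimal usage sketch (the file name and splitter element below are hypothetical): the iterator yields one serialized record per occurrence of the splitter element.

import java.io.FileInputStream

// assume records.xml contains a sequence of <record> elements
val records = new XMLIterator("record", new FileInputStream("records.xml"))
while (records.hasNext) {
  val xml: String = records.next() // one full <record>...</record> fragment
  println(xml.length)
}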

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.plugin.utils;
import java.util.HashMap;
import java.util.HashSet;

View File

@ -17,6 +17,9 @@ public class PMArticle implements Serializable {
* the Pubmed Identifier
*/
private String pmid;
private String pmcId;
/**
* the DOI
*/
@ -122,7 +125,7 @@ public class PMArticle implements Serializable {
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
*
* @return the pubmed Journal Extracted
@ -140,10 +143,11 @@ public class PMArticle implements Serializable {
}
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
* <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
* those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
* All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
* Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
* Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
*
* @return the extracted pubmed Title
*/
@ -250,4 +254,13 @@ public class PMArticle implements Serializable {
public List<PMGrant> getGrants() {
return grants;
}
public String getPmcId() {
return pmcId;
}
public PMArticle setPmcId(String pmcId) {
this.pmcId = pmcId;
return this;
}
}
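For illustration, a minimal sketch of the new fluent setter, assuming PMArticle has a no-argument constructor (the PMC identifier below is made up):

val article = new PMArticle()
article.setPmcId("PMC7654321") // the setter returns this, so calls can be chained
println(article.getPmcId)      // prints PMC7654321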

View File

@ -86,7 +86,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from bip finder!</name>
<name>Produces the unresolved from BIP! Finder</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
@ -135,7 +135,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from FOS!</name>
<name>Produces the unresolved from FOS</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
@ -185,7 +185,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from FOS!</name>
<name>Produces the unresolved from FOS</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareSDGSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>

View File

@ -0,0 +1,37 @@
[
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "d",
"paramLongName": "delimiter",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the hdfs name node",
"paramRequired": true
},
{
"paramName": "if",
"paramLongName": "inputFile",
"paramDescription": "the hdfs name node",
"paramRequired": true
}
]

View File

@ -26,6 +26,7 @@
<switch>
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
<case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
<case to="read">${wf:conf('resumeFrom') eq 'ReadContent'}</case>
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>
@ -60,6 +61,32 @@
<arg>--inputFile</arg><arg>${inputFile}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</java>
<ok to="read"/>
<error to="Kill"/>
</action>
<action name="read">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the AS for OC</name>
<class>eu.dnetlib.dhp.actionmanager.opencitations.ReadCOCI</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
<arg>--outputPath</arg><arg>${workingPath}/COCI_JSON/</arg>
<arg>--delimiter</arg><arg>${delimiter}</arg>
<arg>--inputFile</arg><arg>${inputFileCoci}</arg>
</spark>
<ok to="create_actionset"/>
<error to="Kill"/>
</action>
@ -81,7 +108,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
<arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>

View File

@ -0,0 +1,23 @@
[
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the path where the projects are stored ",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path for the extracted folder",
"paramRequired": true
},
{
"paramName": "hnn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the hdfs namenode",
"paramRequired": true
}
]

View File

@ -0,0 +1,3 @@
#!/bin/bash
# Usage: download.sh <url> <hdfs-target-path>
# Remove any previous copy, then stream the remote file straight into HDFS.
hdfs dfs -rm -f "$2"
curl -LSs "$1" | hdfs dfs -put - "$2"

View File

@ -1,27 +1,9 @@
<workflow-app name="H2020Classification" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>projectFileURL</name>
<description>the url where to get the projects file</description>
</property>
<property>
<name>programmeFileURL</name>
<description>the url where to get the programme file</description>
</property>
<property>
<name>topicFileURL</name>
<description>the url where to get the topic file</description>
</property>
<property>
<name>outputPath</name>
<description>path where to store the action set</description>
</property>
<property>
<name>sheetName</name>
<description>the name of the sheet to read</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
@ -35,40 +17,103 @@
<delete path='${workingDir}'/>
<mkdir path='${workingDir}'/>
</fs>
<ok to="fork_get_info"/>
<ok to="fork_download_info"/>
<error to="Kill"/>
</action>
<fork name="fork_get_info">
<fork name="fork_download_info">
<path start="fork_get_projects"/>
<path start="get_programme_file"/>
<path start="get_topic_file"/>
<path start="download_programme_file"/>
</fork>
<fork name="fork_get_projects">
<path start="get_project_file"/>
<path start="read_projects"/>
<path start="download_projects"/>
<path start="read_projects_from_db"/>
</fork>
<action name="get_project_file">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${projectFileURL}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject</arg>
</java>
<ok to="wait_projects"/>
<action name="download_projects">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>download.sh</exec>
<argument>${downloadH2020Projects}</argument>
<argument>${projectPath}</argument>
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
<file>download.sh</file>
<capture-output/>
</shell>
<ok to="extract_projects"/>
<error to="Kill"/>
</action>
<action name="get_programme_file">
<action name="extract_projects">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ExtractFromZip</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--inputPath</arg><arg>${projectPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/</arg>
</java>
<ok to="read_from_folder"/>
<error to="Kill"/>
</action>
<fork name="read_from_folder">
<path start="read_projects"/>
<path start="read_topic_file"/>
</fork>
<action name="read_projects">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadProjects</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--inputPath</arg><arg>${workingDir}/json/project.json</arg>
<arg>--outputPath</arg><arg>${workingDir}/projects</arg>
</java>
<ok to="wait_read_from_folder"/>
<error to="Kill"/>
</action>
<action name="download_programme_file">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>download.sh</exec>
<argument>${downloadH2020Programme}</argument>
<argument>${programmePath}</argument>
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
<file>download.sh</file>
<capture-output/>
</shell>
<ok to="extract_programme"/>
<error to="Kill"/>
</action>
<action name="extract_programme">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ExtractFromZip</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--inputPath</arg><arg>${programmePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/downloadedProgramme/</arg>
</java>
<ok to="read_programme"/>
<error to="Kill"/>
</action>
<action name="read_programme">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${programmeFileURL}</arg>
<arg>--fileURL</arg><arg>${workingDir}/downloadedProgramme/csv/programme.csv</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme</arg>
</java>
@ -76,20 +121,18 @@
<error to="Kill"/>
</action>
<action name="get_topic_file">
<action name="read_topic_file">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadExcel</main-class>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadTopics</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${topicFileURL}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/topic</arg>
<arg>--sheetName</arg><arg>${sheetName}</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic</arg>
<arg>--inputPath</arg><arg>${workingDir}/json/topics.json</arg>
<arg>--outputPath</arg><arg>${workingDir}/topic</arg>
</java>
<ok to="wait"/>
<ok to="wait_read_from_folder"/>
<error to="Kill"/>
</action>
<action name="read_projects">
<action name="read_projects_from_db">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.ReadProjectsFromDB</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/dbProjects</arg>
@ -123,9 +166,11 @@
<arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
</spark>
<ok to="wait"/>
<!-- <ok to="End"/>-->
<error to="Kill"/>
</action>
<join name="wait_read_from_folder" to="wait_projects"/>
<join name="wait" to="create_updates"/>
<join name="wait_projects" to="prepare_project"/>
@ -153,6 +198,7 @@
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
</spark>
<ok to="wait"/>
<!-- <ok to="End"/>-->
<error to="Kill"/>
</action>

View File

@ -0,0 +1,23 @@
[
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the path where the projects are stored ",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path for the extracted folder",
"paramRequired": true
},
{
"paramName": "hnn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the hdfs namenode",
"paramRequired": true
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "hmu",
"paramLongName": "hive_metastore_uris",
"paramDescription": "the URI for the hive metastore",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "sdb",
"paramLongName": "usagestatsdb",
"paramDescription": "the name of the db to be used",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the workingPath where to save the content of the usage_stats table",
"paramRequired": true
}
]

View File

@ -1,25 +1,12 @@
<workflow-app name="sub_dump_community_products" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="UsageStatsCounts" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
<description>the path where to store the actionset</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
<name>usagestatsdb</name>
<description>the name of the db to be used</description>
</property>
<property>
<name>sparkDriverMemory</name>
@ -76,50 +63,19 @@
</configuration>
</global>
<start to="common_action_community_funder"/>
<start to="atomicactions"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="common_action_community_funder">
<sub-workflow>
<app-path>${wf:appPath()}/dump_common
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
<property>
<name>selectedResults</name>
<value>${sourcePath}</value>
</property>
<property>
<name>communityMapPath</name>
<value>${workingDir}/communityMap</value>
</property>
<property>
<name>outputPath</name>
<value>${workingDir}</value>
</property>
</configuration>
</sub-workflow>
<ok to="splitForCommunities" />
<error to="Kill" />
</action>
<action name="splitForCommunities">
<action name="atomicactions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Split dumped result for community</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkSplitForCommunity</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<name>Produces the atomic action with the usage stats count for results</name>
<class>eu.dnetlib.dhp.actionmanager.usagestats.SparkAtomicActionUsageJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
@ -130,16 +86,14 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/ext</arg>
<arg>--hive_metastore_uris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
<arg>--usagestatsdb</arg><arg>${usagestatsdb}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,63 @@
from urllib.request import urlopen
import json


def retrieve_datacite_clients(base_url):
    """Map each DataCite client id to its re3data identifier (the DOI suffix)."""
    datacite_clients = {}
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            data = json.loads(response.read())
            if 'data' in data and len(data['data']) > 0:
                for item in data['data']:
                    datacite_clients[item['id'].lower()] = \
                        item['attributes']['re3data'].lower().replace("https://doi.org/", "")
                # follow the API pagination cursor; the last page may omit it
                base_url = data['links'].get('next')
            else:
                base_url = None
    return datacite_clients


def retrieve_r3data(start_url):
    """Map each re3data id to its OpenAIRE datasource id and official name."""
    r3data_clients = {}
    page_number = 1
    base_url = start_url
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            data = json.loads(response.read())
            if 'data' in data and len(data['data']) > 0:
                for item in data['data']:
                    r3data_clients[item['id'].lower()] = dict(
                        openaire_id="re3data_____::" + item['attributes']['re3dataId'].lower(),
                        official_name=item['attributes']['repositoryName']
                    )
                page_number += 1
                base_url = f"{start_url}&page[number]={page_number}"
            else:
                base_url = None
    return r3data_clients


base_url = "https://api.datacite.org/clients?query=re3data_id:*&page[size]=250"
dc = retrieve_datacite_clients(base_url)
r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250")

# join the two maps: keep only clients whose re3data id is known to re3data
result = {}
for item in dc:
    res = dc[item].lower()
    if res not in r3:
        print(f"missing {res} for {item} in dictionary")
    else:
        result[item.upper()] = dict(
            openaire_id=r3[res]["openaire_id"],
            datacite_name=r3[res]["official_name"],
            official_name=r3[res]["official_name"]
        )

with open('hostedBy_map.json', 'w', encoding='utf8') as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=1)
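# For reference, the shape of one entry in the generated hostedBy_map.json;
# the client id, re3data id and names below are illustrative, not real:
#
# "EXAMPLE.CLIENT": {
#  "openaire_id": "re3data_____::r3dxxxxxxxxxx",
#  "datacite_name": "Example Repository",
#  "official_name": "Example Repository"
# }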

View File

@ -7,16 +7,14 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
object CollectionUtils {
/**
* This method in pipeline to the transformation phase,
* generates relations in both verse, typically it should be a phase of flatMap
*
* @param i input OAF
* @return
* If the input OAF is an entity -> List(i)
* If the input OAF is a relation -> List(relation, inverseRelation)
*
*/
/** This method, applied in pipeline to the transformation phase,
  * generates relations in both directions; typically it should be a flatMap phase
  *
  * @param i input OAF
  * @return
  *   If the input OAF is an entity -> List(i)
  *   If the input OAF is a relation -> List(relation, inverseRelation)
  */
def fixRelations(i: Oaf): List[Oaf] = {
if (i.isInstanceOf[OafEntity])

View File

@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClientBuilder
abstract class AbstractRestClient extends Iterator[String] {
var buffer: List[String] = List()
@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
var complete: Boolean = false
def extractInfo(input: String): Unit
protected def getBufferData(): Unit
def doHTTPGETRequest(url: String): String = {
val httpGet = new HttpGet(url)
doHTTPRequest(httpGet)
@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
buffer.nonEmpty && current_index < buffer.size
}
override def next(): String = {
val next_item: String = buffer(current_index)
current_index = current_index + 1
@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
next_item
}
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
val timeout = 60; // seconds
val config = RequestConfig.custom()
val timeout = 600; // seconds
val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
.setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
} else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
@ -87,4 +83,4 @@ abstract class AbstractRestClient extends Iterator[String] {
}
getBufferData()
}
}

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.datacite
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, JValue}
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient {
override def extractInfo(input: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -16,16 +16,18 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
current_index = 0
}
def get_url():String ={
val to = if (until> 0) s"$until" else "*"
def get_url(): String = {
val to = if (until > 0) s"$until" else "*"
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
}
override def getBufferData(): Unit = {
if (!complete) {
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
val response =
if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
else doHTTPGETRequest(get_url())
extractInfo(response)
}
}
}
}
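A minimal consumption sketch; note that this performs live HTTP calls against the DataCite API, and the parameter values are illustrative:

// the importer is an Iterator[String]: each element is one JSON page of DOIs
val importer = new DataciteAPIImporter(timestamp = 0L, blocks = 100L)
importer.take(2).foreach(page => println(page.length))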

View File

@ -10,24 +10,38 @@ import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/**
* This class represent the dataModel of the input Dataset of Datacite
* @param doi THE DOI
* @param timestamp timestamp of last update date
* @param isActive the record is active or deleted
* @param json the json native records
*/
/** This class represents the data model of the input Dataset of Datacite
  * @param doi the DOI
  * @param timestamp timestamp of the last update date
  * @param isActive whether the record is active or deleted
  * @param json the native json record
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following classes are utility classes used for the mapping from
the Datacite json format to the OAF schema
*/
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class RelatedIdentifierType(
relationType: String,
relatedIdentifier: String,
relatedIdentifierType: String
) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class NameIdentifiersType(
nameIdentifierScheme: Option[String],
schemeUri: Option[String],
nameIdentifier: Option[String]
) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class CreatorType(
nameType: Option[String],
nameIdentifiers: Option[List[NameIdentifiersType]],
name: Option[String],
familyName: Option[String],
givenName: Option[String],
affiliation: Option[List[String]]
) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
@ -35,100 +49,210 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class FundingReferenceType(
funderIdentifierType: Option[String],
awardTitle: Option[String],
awardUri: Option[String],
funderName: Option[String],
funderIdentifier: Option[String],
awardNumber: Option[String]
) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class OAFRelations(relation:String, inverse:String, relType:String)
case class OAFRelations(relation: String, inverse: String, relType: String)
class DataciteModelConstants extends Serializable {
}
class DataciteModelConstants extends Serializable {}
object DataciteModelConstants {
val REL_TYPE_VALUE:String = "resultResult"
val REL_TYPE_VALUE: String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val DATACITE_NAME = "Datacite"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
val subRelTypeMapping: Map[String,OAFRelations] = Map(
ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
val DATACITE_COLLECTED_FROM: KeyValue =
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
val subRelTypeMapping: Map[String, OAFRelations] = Map(
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
ModelConstants.IS_SUPPLEMENTED_BY,
ModelConstants.IS_SUPPLEMENT_TO,
ModelConstants.SUPPLEMENT
),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
ModelConstants.IS_SUPPLEMENT_TO,
ModelConstants.IS_SUPPLEMENTED_BY,
ModelConstants.SUPPLEMENT
),
ModelConstants.HAS_PART -> OAFRelations(
ModelConstants.HAS_PART,
ModelConstants.IS_PART_OF,
ModelConstants.PART
),
ModelConstants.IS_PART_OF -> OAFRelations(
ModelConstants.IS_PART_OF,
ModelConstants.HAS_PART,
ModelConstants.PART
),
ModelConstants.IS_VERSION_OF -> OAFRelations(
ModelConstants.IS_VERSION_OF,
ModelConstants.HAS_VERSION,
ModelConstants.VERSION
),
ModelConstants.HAS_VERSION -> OAFRelations(
ModelConstants.HAS_VERSION,
ModelConstants.IS_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
ModelConstants.IS_IDENTICAL_TO,
ModelConstants.IS_IDENTICAL_TO,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(
ModelConstants.IS_CONTINUED_BY,
ModelConstants.CONTINUES,
ModelConstants.RELATIONSHIP
),
ModelConstants.CONTINUES -> OAFRelations(
ModelConstants.CONTINUES,
ModelConstants.IS_CONTINUED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.IS_PREVIOUS_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
ModelConstants.IS_PREVIOUS_VERSION_OF,
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.DOCUMENTS,
ModelConstants.RELATIONSHIP
),
ModelConstants.DOCUMENTS -> OAFRelations(
ModelConstants.DOCUMENTS,
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_SOURCE_OF -> OAFRelations(
ModelConstants.IS_SOURCE_OF,
ModelConstants.IS_DERIVED_FROM,
ModelConstants.VERSION
),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(
ModelConstants.IS_DERIVED_FROM,
ModelConstants.IS_SOURCE_OF,
ModelConstants.VERSION
),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
ModelConstants.IS_VARIANT_FORM_OF,
ModelConstants.IS_DERIVED_FROM,
ModelConstants.VERSION
),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
ModelConstants.IS_OBSOLETED_BY,
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.REVIEWS -> OAFRelations(
ModelConstants.REVIEWS,
ModelConstants.IS_REVIEWED_BY,
ModelConstants.REVIEW
),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(
ModelConstants.IS_REVIEWED_BY,
ModelConstants.REVIEWS,
ModelConstants.REVIEW
),
ModelConstants.DOCUMENTS -> OAFRelations(
ModelConstants.DOCUMENTS,
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.DOCUMENTS,
ModelConstants.RELATIONSHIP
),
ModelConstants.COMPILES -> OAFRelations(
ModelConstants.COMPILES,
ModelConstants.IS_COMPILED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_COMPILED_BY -> OAFRelations(
ModelConstants.IS_COMPILED_BY,
ModelConstants.COMPILES,
ModelConstants.RELATIONSHIP
)
)
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
require(stream!= null)
require(stream != null)
Source.fromInputStream(stream).getLines().toList
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
trust
)
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
Locale.ENGLISH
)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val df_it: DateTimeFormatter =
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda__h2020::"
),
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda_______::"
)
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
Pattern.compile(
"(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
Pattern.MULTILINE
),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
Pattern.compile(
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
Pattern.MULTILINE
),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
Pattern.compile(
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
Pattern.MULTILINE
),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
}

View File

@ -19,22 +19,46 @@ import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.Source
object DataciteToOAFTransformation {
case class HostedByMapType(
openaire_id: String,
datacite_name: String,
official_name: String,
similarity: Option[Float]
) {}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(
ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID,
ModelConstants.UNKNOWN_REPOSITORY.getValue,
ModelConstants.UNKNOWN_REPOSITORY.getValue,
Some(1.0f)
)
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s)
json.extract[Map[String, HostedByMapType]]
}
/** This method indicates whether a record should be skipped: when the raw record
  * contains one of the invalid strings defined in the file datacite_filter,
  * or when the publisher is "FAIRsharing"
  *
  * @param record : the Datacite record, not yet parsed
  * @param json   : the parsed record
  * @return True if the record should be skipped
  */
def skip_record(record: String, json: org.json4s.JValue): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher")
.extractOrElse[String]("")
.equalsIgnoreCase("FAIRsharing")
/**
* This method should skip record if json contains invalid text
* defined in gile datacite_filter
*
* @param json
* @return True if the record should be skipped
*/
def skip_record(json: String): Boolean = {
datacite_filter.exists(f => json.contains(f))
}
@deprecated("this method will be removed", "dhp")
@ -74,35 +98,39 @@ object DataciteToOAFTransformation {
}
/** This utility method indicates whether the embargo date has been reached
  * @param embargo_end_date the embargo end date, expected in [yyyy-MM-dd] format
  * @return True if the embargo date has been reached, false otherwise
  */
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
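A quick illustration of the semantics (the dates below are illustrative; the results assume the current date lies between them):

embargo_end("2000-01-01") // true: the embargo date has passed
embargo_end("2999-12-31") // false: still under embargo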
def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
}
).find(s => s != null)
val d = Date_regex
.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
})
.find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable => try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
case _: Throwable =>
try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
}
}
d
@ -118,31 +146,78 @@ object DataciteToOAFTransformation {
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
  /** Use the vocabulary dnet:publication_resource to find a synonym for one of these terms
    * and get the instance.type.
    * Using the dnet:result_typologies vocabulary, we look up the instance.type synonym
    * to generate one of the following main entities:
    *  - publication
    *  - dataset
    *  - software
    *  - otherresearchproduct
    *
    * @param resourceType        the Datacite resource type
    * @param resourceTypeGeneral the Datacite general resource type
    * @param schemaOrg           the schema.org type
    * @param vocabularies        the vocabularies used to resolve the synonyms
    * @return a pair of qualifiers (instance type, result type), or null when no synonym is found
    */
def getTypeQualifier(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
val typeQualifier = vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_PUBLICATION_RESOURCE,
resourceTypeGeneral
)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
null
}
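Illustratively, and only as an assumption about typical vocabulary contents (the exact terms depend on the dnet:publication_resource and dnet:result_typologies vocabularies loaded at runtime):

// getTypeQualifier("Journal Article", "Text", null, vocabularies)
//   may resolve to a pair like (article qualifier, publication qualifier),
//   or null when no synonym is found in the vocabularies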
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
def getResult(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): Result = {
val typeQualifiers: (Qualifier, Qualifier) =
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
@ -168,13 +243,12 @@ object DataciteToOAFTransformation {
null
}
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
JObject(dates) <- json \\ "dates"
JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
@ -182,29 +256,28 @@ object DataciteToOAFTransformation {
}
/**
* As describe in ticket #6377
* when the result come from figshare we need to remove subject
* and set Access rights OPEN.
*
* @param r
*/
/** As described in ticket #6377,
  * when the result comes from figshare we need to remove the subjects
  * and set the access rights to OPEN.
  *
  * @param r the result to be fixed in place
  */
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
val hosted_by_figshare = r
.getInstance()
.asScala
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
val l: List[Subject] = List()
r.setSubject(l.asJava)
}
}
}
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
@ -214,7 +287,13 @@ object DataciteToOAFTransformation {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
def generateRelation(
sourceId: String,
targetId: String,
relClass: String,
cf: KeyValue,
di: DataInfo
): Relation = {
val r = new Relation
r.setSource(sourceId)
@ -226,7 +305,6 @@ object DataciteToOAFTransformation {
r.setDataInfo(di)
r
}
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
@ -238,22 +316,28 @@ object DataciteToOAFTransformation {
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
}
else
} else
List()
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (skip_record(input))
return List()
def generateOAF(
input: String,
ts: Long,
dateOfCollection: Long,
vocabularies: VocabularyGroup,
exportLinks: Boolean
): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
if (skip_record(input, json))
return List()
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val resourceTypeGeneral =
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
@ -265,63 +349,92 @@ object DataciteToOAFTransformation {
if (result == null)
return List()
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
// the DOI is mapped to a PID inside an Instance object
val doi_q = OafMapperUtils.qualifier(
"doi",
"doi",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES
)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
// this identifier will be replaced later on, applying the PID-based identifier generation logic
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
result.setOriginalId(List(doi).asJava)
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(d))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => {
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
}
else
null
a.setPid(
c.nameIdentifiers.get
.map(ni => {
val q =
if (ni.nameIdentifierScheme.isDefined)
vocabularies.getTermAsQualifier(
ModelConstants.DNET_PID_TYPES,
ni.nameIdentifierScheme.get.toLowerCase()
)
else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
} else
null
}
})
.asJava
)
.asJava)
}
if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
a.setAffiliation(
c.affiliation.get
.filter(af => af.nonEmpty)
.map(af => OafMapperUtils.field(af, dataInfo))
.asJava
)
a.setRank(idx + 1)
a
}
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
}
}).asJava)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(
titles
.filter(t => t.title.nonEmpty)
.map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(
t.title.get,
t.titleType.get,
t.titleType.get,
ModelConstants.DNET_DATACITE_TITLE,
ModelConstants.DNET_DATACITE_TITLE,
null
)
}
})
.asJava
)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
@ -337,46 +450,81 @@ object DataciteToOAFTransformation {
if (a_date.isDefined) {
if (doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
result.setEmbargoenddate(
OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
)
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
)
result
.getInstance()
.get(0)
.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
)
} else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
} else if (publication_year != null) {
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
result.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
)
result
.getInstance()
.get(0)
.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
)
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result
.getInstance()
.get(0)
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
result.setRelevantdate(
dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d =>
(
d._1.get,
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
)
)
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2))
.asJava
)
val subjects = (json \\ "subjects").extract[List[SubjectType]]
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
result.setSubject(
subjects
.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.subject(
s.subject.get,
SUBJ_CLASS,
SUBJ_CLASS,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
null
)
)
.asJava
)
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
@ -384,66 +532,88 @@ object DataciteToOAFTransformation {
result.setDescription(
descriptions
.filter(d => d.description.isDefined).
map(d =>
OafMapperUtils.field(d.description.get, null)
).filter(s => s != null).asJava)
.filter(d => d.description.isDefined)
.map(d => OafMapperUtils.field(d.description.get, null))
.filter(s => s != null)
.asJava
)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
result.setLanguage(
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
)
val instance = result.getInstance().get(0)
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val aRights: Option[AccessRight] = accessRights
.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
})
.find(q => q != null)
.map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
val access_rights_qualifier =
if (aRights.isDefined) aRights.get
else
OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
if (client.isDefined) {
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
.find(r =>
r.startsWith("http") && r.matches(
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
)
)
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
val oid = result.getId
result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
if (!result.getId.equalsIgnoreCase(oid)) {
result.setOriginalId((oid :: List(doi)).asJava)
}
var relations: List[Relation] =
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
@ -452,51 +622,81 @@ object DataciteToOAFTransformation {
if (exportLinks) {
val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
relations = relations ::: generateRelations(
rels,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
else
} else
List(result)
}
private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = {
rels
private def generateRelations(
rels: List[RelatedIdentifierType],
id: String,
date: String
): List[Relation] = {
val bidirectionalRels: List[Relation] = rels
.filter(r =>
subRelTypeMapping.contains(r.relationType) && (
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
subRelTypeMapping
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType).relType
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType)
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType))
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel
val target = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
relation(id, target, subRelType, r.relationType, date)
})
val citationRels: List[Relation] = rels
.filter(r =>
(r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv")) &&
(r.relationType.toLowerCase.contains("cite") || r.relationType.toLowerCase.contains("reference"))
)
.map(r => {
r.relationType match {
case ModelConstants.CITES | ModelConstants.REFERENCES =>
val target = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
relation(id, target, ModelConstants.CITATION, ModelConstants.CITES, date)
case ModelConstants.IS_CITED_BY | ModelConstants.IS_REFERENCED_BY =>
val source = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
relation(source, id, ModelConstants.CITATION, ModelConstants.CITES, date)
}
})
citationRels ::: bidirectionalRels
}
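// Helper that assembles a single relation stamped with the Datacite provenance and
// the relation date carried as a property.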
def relation(source: String, target: String, subRelType: String, relClass: String, date: String): Relation = {
val rel = new Relation
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(relClass)
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(source)
rel.setTarget(target)
rel
}
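// Hypothetical usage of relation(...) (DOI and date invented for illustration):
// emit a citation from the current result to a not-yet-resolved DOI.
//   val target = DHPUtils.generateUnresolvedIdentifier("10.1234/abcd", "doi")
//   val rel    = relation(result.getId, target, ModelConstants.CITATION, ModelConstants.CITES, "2021-01-01")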
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
@@ -504,5 +704,4 @@ object DataciteToOAFTransformation {
s"10|$b::${DHPUtils.md5(a)}"
}
}
View File
@@ -12,12 +12,12 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
/** Here every Spark application runs this method,
* where the whole logic of the Spark node is defined.
*/
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
@@ -46,49 +46,65 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
reportTotalSize(targetPath, outputBasePath)
}
/**
* For working with MDStore we need to store in a file on hdfs the size of
* the current dataset
* @param targetPath
* @param outputBasePath
*/
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
/** For working with an MDStore we need to store on HDFS a file
* reporting the size of the current dataset.
* @param targetPath     path of the dataset whose records are counted
* @param outputBasePath MDStore base path where the size file is written
*/
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
val total_items = spark.read.text(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
writeHdfsFile(
spark.sparkContext.hadoopConfiguration,
s"$total_items",
outputBasePath + MDSTORE_SIZE_PATH
)
}
/**
* Generate the transformed and cleaned OAF Dataset from the native one
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
* @param exportLinks If true it generates unresolved links
* @param vocabularies vocabularies for cleaning
* @param targetPath the targetPath of the result Dataset
*/
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
require(spark!= null)
/** Generate the transformed and cleaned OAF Dataset from the native one
*
* @param sourcePath   sourcePath of the native Dataset in JSON/Datacite format
* @param exportLinks  if true, unresolved links are also generated
* @param vocabularies vocabularies used for cleaning
* @param targetPath   the targetPath of the result Dataset
* @param spark        the active Spark session
*/
def generateDataciteDataset(
sourcePath: String,
exportLinks: Boolean,
vocabularies: VocabularyGroup,
targetPath: String,
spark: SparkSession
): Unit = {
require(spark != null)
import spark.implicits._
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
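// Pipeline: read the native Datacite dump, keep only active records, map each one
// to zero or more OAF objects via DataciteToOAFTransformation, drop nulls and save.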
CollectionUtils.saveDataset(
spark.read.load(sourcePath).as[DataciteType]
spark.read
.load(sourcePath)
.as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.flatMap(d =>
DataciteToOAFTransformation
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
)
.filter(d => d != null),
targetPath)
targetPath
)
}
}
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
new GenerateDataciteDatasetSpark(
"/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
args,
log
).initialize().run()
}
}
View File
@@ -22,7 +22,6 @@ object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
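// Parses a raw record returned by the Datacite API into a DataciteType, keeping the
// original JSON payload and converting the "updated" ISO timestamp to epoch seconds.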
def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
@@ -32,14 +31,26 @@ object ImportDatacite {
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
DataciteType(
doi = doi,
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
isActive = isActive,
json = input
)
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/datacite/import_from_api.json"
)
)
.mkString
)
parser.parseArgument(args)
val master = parser.get("master")
@@ -60,7 +71,8 @@ object ImportDatacite {
val skipImport = parser.get("skipImport")
log.info(s"skipImport is $skipImport")
val spark: SparkSession = SparkSession.builder()
val spark: SparkSession = SparkSession
.builder()
.appName(ImportDatacite.getClass.getSimpleName)
.master(master)
.getOrCreate()
@@ -78,45 +90,48 @@ object ImportDatacite {
import spark.implicits._
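// Aggregator used to pick, among duplicate records (grouped upstream, presumably
// by DOI), the one with the most recent timestamp; null is the neutral element.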
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {

  override def zero: DataciteType = null

  override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
    if (b == null)
      return a
    if (a == null)
      return b
    if (a.timestamp > b.timestamp) {
      return a
    }
    b
  }

  override def merge(a: DataciteType, b: DataciteType): DataciteType = {
    reduce(a, b)
  }

  override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]

  override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]

  override def finish(reduction: DataciteType): DataciteType = reduction
}
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
val ts = dump.select(max("timestamp")).first().getLong(0)
println(s"last Timestamp is $ts")
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
val cnt =
if ("true".equalsIgnoreCase(spkipImport)) 1
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) {
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
val inputRdd: RDD[DataciteType] = sc
.sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
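// Merge the fresh harvest with the existing dump: aggregate duplicates keeping
// the most recent version per record, repartition and rewrite the dump.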
@@ -129,7 +144,9 @@ object ImportDatacite {
.agg(dataciteAggregator.toColumn)
.map(s => s._2)
.repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
.write
.mode(SaveMode.Overwrite)
.save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true)
@@ -137,14 +154,24 @@ object ImportDatacite {
}
}
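// Harvests the Datacite API from the last known timestamp onwards and appends each
// record to an HDFS SequenceFile (IntWritable progressive key -> Text JSON value).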
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
private def writeSequenceFile(
hdfsTargetPath: Path,
timestamp: Long,
conf: Configuration,
bs: Int
): Long = {
var from: Long = timestamp * 1000
val delta: Long = 100000000L
var client: DataciteAPIImporter = null
val now: Long = System.currentTimeMillis()
var i = 0
try {
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
val writer = SequenceFile.createWriter(
conf,
SequenceFile.Writer.file(hdfsTargetPath),
SequenceFile.Writer.keyClass(classOf[IntWritable]),
SequenceFile.Writer.valueClass(classOf[Text])
)
try {
var start: Long = System.currentTimeMillis
while (from < now) {
@@ -153,16 +180,16 @@ object ImportDatacite {
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
while (client.hasNext) {
key.set({
key.set {
i += 1;
i - 1
})
}
value.set(client.next())
writer.append(key, value)
writer.hflush()
if (i % 1000 == 0) {
end = System.currentTimeMillis
val time = (end - start) / 1000.0F
val time = (end - start) / 1000.0f
println(s"Imported $i in $time seconds")
start = System.currentTimeMillis
}
@@ -174,8 +201,7 @@ object ImportDatacite {
case e: Throwable =>
println("Error", e)
} finally if (writer != null) writer.close()
}
catch {
} catch {
case e: Throwable =>
log.error("Error", e)
}
Some files were not shown because too many files have changed in this diff