resolved conflicts for #165

2021-11-26 16:15:11 +01:00 · 2021-11-26 16:15:11 +01:00 · 1de881b796
parent 3f9b2ba8ce 014e872ae1
commit 1de881b796
304 changed files with 18805 additions and 2237 deletions
--- a/dhp-build/dhp-code-style/pom.xml
+++ b/dhp-build/dhp-code-style/pom.xml
@ -22,9 +22,20 @@
            <id>dnet45-releases</id>
            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
        </repository>
        <site>
            <id>DHPSite</id>
            <url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
        </site>
    </distributionManagement>
    <build>
        <extensions>
            <extension>
                <groupId>org.apache.maven.wagon</groupId>
                <artifactId>wagon-ssh</artifactId>
                <version>2.10</version>
            </extension>
        </extensions>
        <pluginManagement>
            <plugins>
                <plugin>
@ -35,7 +46,7 @@
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-site-plugin</artifactId>
-                    <version>3.7.1</version>
+                    <version>3.9.1</version>
                </plugin>
            </plugins>
        </pluginManagement>
@ -43,6 +54,7 @@
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
    </properties>
 </project>
--- a/dhp-build/dhp-code-style/src/site/site.xml
+++ b/dhp-build/dhp-code-style/src/site/site.xml
@ -0,0 +1,21 @@
 <?xml version="1.0" encoding="ISO-8859-1"?>
 <project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
         name="DHP-Aggregation">
    <skin>
        <groupId>org.apache.maven.skins</groupId>
        <artifactId>maven-fluido-skin</artifactId>
        <version>1.8</version>
    </skin>
    <poweredBy>
        <logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
              img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
    </poweredBy>
    <body>
        <links>
            <item name="Code" href="https://code-repo.d4science.org/" />
        </links>
        <menu ref="modules" />
        <menu ref="reports"/>
    </body>
 </project>
--- a/dhp-build/pom.xml
+++ b/dhp-build/pom.xml
@ -10,6 +10,9 @@
 	<packaging>pom</packaging>
 	<description>This module is a container for the build tools used in dnet-hadoop</description>
 	<properties>
 		<maven.javadoc.skip>true</maven.javadoc.skip>
 	</properties>
 	<modules>
 		<module>dhp-code-style</module>
@ -17,4 +20,12 @@
 		<module>dhp-build-properties-maven-plugin</module>
 	</modules>
 <distributionManagement>
 	 <site>
 		 <id>DHPSite</id>
 		 <url>${dhp.site.stage.path}/dhp-build/</url>
 	 </site>
 </distributionManagement>
 </project>
--- a/dhp-build/src/site/site.xml
+++ b/dhp-build/src/site/site.xml
@ -0,0 +1,22 @@
 <?xml version="1.0" encoding="ISO-8859-1"?>
 <project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
         name="DHP-Aggregation">
    <skin>
        <groupId>org.apache.maven.skins</groupId>
        <artifactId>maven-fluido-skin</artifactId>
        <version>1.8</version>
    </skin>
    <poweredBy>
        <logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
              img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
    </poweredBy>
    <body>
        <links>
            <item name="Code" href="https://code-repo.d4science.org/" />
        </links>
        <menu ref="modules" />
        <menu ref="reports"/>
    </body>
 </project>
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -13,7 +13,51 @@
 	<artifactId>dhp-common</artifactId>
 	<packaging>jar</packaging>
 	<distributionManagement>
 		<site>
 			<id>DHPSite</id>
 			<url>${dhp.site.stage.path}/dhp-common</url>
 		</site>
 	</distributionManagement>
 	<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
 	<build>
 		<plugins>
 			<plugin>
 				<groupId>net.alchim31.maven</groupId>
 				<artifactId>scala-maven-plugin</artifactId>
 				<version>${net.alchim31.maven.version}</version>
 				<executions>
 					<execution>
 						<id>scala-compile-first</id>
 						<phase>initialize</phase>
 						<goals>
 							<goal>add-source</goal>
 							<goal>compile</goal>
 						</goals>
 					</execution>
 					<execution>
 						<id>scala-test-compile</id>
 						<phase>process-test-resources</phase>
 						<goals>
 							<goal>testCompile</goal>
 						</goals>
 					</execution>
 					<execution>
 						<id>scala-doc</id>
 						<phase>process-resources</phase> <!-- or wherever -->
 						<goals>
 							<goal>doc</goal>
 						</goals>
 					</execution>
 				</executions>
 				<configuration>
 					<scalaVersion>${scala.version}</scalaVersion>
 				</configuration>
 			</plugin>
 		</plugins>
 	</build>
 	<dependencies>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@ -0,0 +1,72 @@
 package eu.dnetlib.dhp.application
 import scala.io.Source
 /**
 * Base contract that every Spark application written in Scala should extend.
 *
 * Implementations provide the classpath location of the JSON file describing
 * their arguments and the logic executed by the Spark node.
 */
 trait SparkScalaApplication {
  /**
   * Classpath location of the JSON resource that
   * describes all the arguments needed to run
   */
  val propertyPath: String
  /**
   * Parses the given arguments using the JSON description
   * found in the classpath at the location held by propertyPath.
   *
   * @param args the list of arguments
   * @return the parser, after parseArgument has been invoked on args
   * @throws IllegalArgumentException when no resource is found at propertyPath
   */
  def parseArguments(args: Array[String]): ArgumentApplicationParser = {
    val stream = getClass.getResourceAsStream(propertyPath)
    // getResourceAsStream returns null for a missing resource: fail with an explicit message
    require(stream != null, s"Argument description not found in the classpath: $propertyPath")
    val source = Source.fromInputStream(stream)
    // the source (and the underlying stream) is released as soon as the description is read
    val description = try source.mkString finally source.close()
    val parser = new ArgumentApplicationParser(description)
    parser.parseArgument(args)
    parser
  }
  /**
   * Entry point of the application: implementations define here
   * the whole logic of the spark node
   */
  def run(): Unit
 }
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.SparkSession
 import org.slf4j.Logger
 abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends  SparkScalaApplication {
  /** Argument parser; stays null until initialize() is invoked. */
  var parser: ArgumentApplicationParser = _
  /** Spark session; stays null until initialize() is invoked. */
  var spark: SparkSession = _
  /**
   * Parses the arguments first and creates the Spark session afterwards.
   *
   * @return this application
   */
  def initialize(): SparkScalaApplication = {
    parser = parseArguments(args)
    spark = createSparkSession()
    this
  }
  /**
   * Creates (or reuses) a Spark session whose master is the value of the
   * "master" argument and whose name is the simple name of the concrete class.
   *
   * @return a spark Session
   */
  private def createSparkSession(): SparkSession = {
    // the parser must have been populated before the session can be configured
    require(parser != null)
    val sparkConf: SparkConf = new SparkConf()
    val sparkMaster = parser.get("master")
    log.info(s"Creating Spark session: Master: $sparkMaster")
    val sessionBuilder = SparkSession
      .builder()
      .config(sparkConf)
      .appName(getClass.getSimpleName)
      .master(sparkMaster)
    sessionBuilder.getOrCreate()
  }
 }
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java
@ -28,7 +28,7 @@ public class HdfsSupport {
 	 * @param configuration Configuration of hadoop env
 	 */
 	public static boolean exists(String path, Configuration configuration) {
-		logger.info("Removing path: {}", path);
+		logger.info("Checking existence for path: {}", path);
 		return rethrowAsRuntimeException(
 			() -> {
 				Path f = new Path(path);
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
@ -85,6 +85,13 @@ public class MakeTarArchive implements Serializable {
 		String p_string = p.toString();
 		if (!p_string.endsWith("_SUCCESS")) {
 			String name = p_string.substring(p_string.lastIndexOf("/") + 1);
 			if (name.startsWith("part-") & name.length() > 10) {
 				String tmp = name.substring(0, 10);
 				if (name.contains(".")) {
 					tmp += name.substring(name.indexOf("."));
 				}
 				name = tmp;
 			}
 			TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
 			entry.setSize(fileStatus.getLen());
 			current_size += fileStatus.getLen();
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@ -27,8 +27,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 	public static final int ORCID_LEN = 19;
 	public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
 	public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
-	public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
+
-	public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
+	public static final String TITLE_TEST = "test";
 	public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
 	public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
 	public static <T extends Oaf> T fixVocabularyNames(T value) {
 		if (value instanceof Datasource) {
@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 									final String title = sp
 										.getValue()
 										.toLowerCase();
-									final String residual = Unidecode
+									final String decoded = Unidecode.decode(title);
-										.decode(title)
+
-										.replaceAll(TITLE_FILTER_REGEX, "");
+									if (StringUtils.contains(decoded, TITLE_TEST)) {
-									return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
+										return decoded
 											.replaceAll(TITLE_FILTER_REGEX, "")
 											.length() > TITLE_FILTER_RESIDUAL_LENGTH;
 									}
 									return !decoded
 										.replaceAll("\\W|\\d", "")
 										.isEmpty();
 								})
 							.map(GraphCleaningFunctions::cleanValue)
 							.collect(Collectors.toList()));
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
@ -4,19 +4,19 @@ package eu.dnetlib.dhp.utils;
 import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
-import java.util.List;
+import java.util.*;
-import java.util.Map;
+import java.util.stream.Collectors;
 import java.util.Properties;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.codec.binary.Base64OutputStream;
 import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.SaveMode;
 import org.slf4j.Logger;
@ -26,6 +26,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Maps;
 import com.jayway.jsonpath.JsonPath;
 import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
 import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
 import net.minidev.json.JSONArray;
 import scala.collection.JavaConverters;
 import scala.collection.Seq;
@ -52,10 +54,56 @@ public class DHPUtils {
 		}
 	}
 	/**
 	 * Retrieves from the metadata store manager application the list of paths associated with mdstores characterized
 	 * by the given format, layout, interpretation. Mdstores with a blank hdfs path or a blank current version are
 	 * skipped; each returned path has the form {@code <hdfsPath>/<currentVersion>/store}.
 	 * @param mdstoreManagerUrl the URL of the mdstore manager service
 	 * @param format the mdstore format (compared ignoring case)
 	 * @param layout the mdstore layout (compared ignoring case)
 	 * @param interpretation the mdstore interpretation (compared ignoring case)
 	 * @param includeEmpty when false, mdstores whose size is not greater than zero are excluded
 	 * @return the set of hdfs paths
 	 * @throws IOException in case of HTTP communication issues
 	 */
 	public static Set<String> mdstorePaths(final String mdstoreManagerUrl,
 		final String format,
 		final String layout,
 		final String interpretation,
 		boolean includeEmpty) throws IOException {
 		final String url = mdstoreManagerUrl + "/mdstores/";
 		final ObjectMapper objectMapper = new ObjectMapper();
 		final HttpGet req = new HttpGet(url);
 		// a single try-with-resources closes the response first and the client afterwards
 		try (final CloseableHttpClient client = HttpClients.createDefault();
 			final CloseableHttpResponse response = client.execute(req)) {
 			// decode the body explicitly as UTF-8 instead of relying on the JVM default charset
 			final String json = IOUtils.toString(response.getEntity().getContent(), StandardCharsets.UTF_8);
 			final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
 			return Arrays
 				.stream(mdstores)
 				.filter(md -> md.getFormat().equalsIgnoreCase(format))
 				.filter(md -> md.getLayout().equalsIgnoreCase(layout))
 				.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
 				.filter(md -> StringUtils.isNotBlank(md.getHdfsPath()))
 				.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
 				.filter(md -> includeEmpty || md.getSize() > 0)
 				.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
 				.collect(Collectors.toSet());
 		}
 	}
 	/**
 	 * Builds an identifier by joining the namespace prefix and the value returned by
 	 * {@code DHPUtils.md5} for the original identifier, separated by {@code ::}.
 	 * @param originalId the original identifier, passed to {@code DHPUtils.md5}
 	 * @param nsPrefix the namespace prefix placed before the separator
 	 * @return the string {@code <nsPrefix>::<md5(originalId)>}
 	 */
 	public static String generateIdentifier(final String originalId, final String nsPrefix) {
 		return nsPrefix + "::" + DHPUtils.md5(originalId);
 	}
 	/**
 	 * Builds the identifier of an unresolved entity in the form
 	 * {@code unresolved::<normalized pid>::<lower-cased and trimmed pid type>}.
 	 * @param pid the persistent identifier value, normalized via {@code CleaningFunctions.normalizePidValue}
 	 * @param pidType the persistent identifier type; must not be null
 	 * @return the unresolved identifier
 	 */
 	public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
 		final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
 		// Locale.ROOT keeps the generated identifier independent of the JVM default locale
 		// (e.g. under a Turkish locale "DOI" would otherwise be lower-cased with a dotless i)
 		return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase(Locale.ROOT).trim());
 	}
 	public static String getJPathString(final String jsonPath, final String json) {
 		try {
 			Object o = JsonPath.read(json, jsonPath);
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
@ -107,7 +107,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=5000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -159,7 +159,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=5000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
@ -99,7 +99,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=5000
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
--- a/dhp-workflows/dhp-aggregation/pom.xml
+++ b/dhp-workflows/dhp-aggregation/pom.xml
@ -29,6 +29,13 @@
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-doc</id>
                        <phase>process-resources</phase> <!-- or wherever -->
                        <goals>
                            <goal>doc</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/Constants.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/Constants.java
@ -0,0 +1,49 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
 import java.util.Optional;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 /**
  * Constants and static helpers used by the jobs of the createunresolvedentities package.
  */
 public class Constants {
 	public static final String DOI = "doi";
 	public static final String UPDATE_DATA_INFO_TYPE = "update";
 	public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
 	public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
 	public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
 	public static final String FOS_CLASS_ID = "FOS";
 	public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
 	public static final String NULL = "NULL";
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	/** Not meant to be instantiated: the class only exposes static members. */
 	private Constants() {
 	}
 	/**
 	 * Reads the "isSparkSessionManaged" argument from the parser.
 	 * @param parser the parser holding the application arguments
 	 * @return {@code Boolean.TRUE} when the argument is absent, otherwise the result of
 	 *         {@code Boolean.valueOf} applied to its value
 	 */
 	public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
 		final String managedFlag = parser.get("isSparkSessionManaged");
 		if (managedFlag == null) {
 			return Boolean.TRUE;
 		}
 		return Boolean.valueOf(managedFlag);
 	}
 	/**
 	 * Reads the text found at the given path and deserializes every line with {@link #OBJECT_MAPPER}.
 	 * @param spark the spark session used to read the input
 	 * @param inputPath the path of the text input, one serialized object per line
 	 * @param clazz the class each line is deserialized to; also used to build the bean encoder
 	 * @return the dataset of deserialized objects
 	 */
 	public static <R> Dataset<R> readPath(
 		SparkSession spark, String inputPath, Class<R> clazz) {
 		final MapFunction<String, R> lineParser = line -> OBJECT_MAPPER.readValue(line, clazz);
 		final Dataset<String> lines = spark.read().textFile(inputPath);
 		return lines.map(lineParser, Encoders.bean(clazz));
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSData.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSData.java
@ -0,0 +1,77 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Serializable;
 import java.util.Objects;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.collection.GetCSV;
 /**
  * Command line application that reads a delimited (csv-like) file from a Hadoop file system and hands it to
  * {@code GetCSV.getCsv} together with the output location and the name of the target class.
  */
 public class GetFOSData implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(GetFOSData.class);
 	/** Delimiter used when the "delimiter" argument is missing or empty. */
 	public static final char DEFAULT_DELIMITER = '\t';
 	/**
 	 * Parses the arguments described in get_fos_parameters.json (sourcePath, outputPath, hdfsNameNode,
 	 * classForName and the optional delimiter), configures the file system and triggers the rewrite.
 	 * @param args the command line arguments
 	 * @throws Exception when the arguments cannot be parsed or the rewrite fails
 	 */
 	public static void main(final String[] args) throws Exception {
 		// NOTE(review): the parameter file is decoded with the JVM default charset — confirm this is intended
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					Objects
 						.requireNonNull(
 							GetFOSData.class
 								.getResourceAsStream(
 									"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json"))));
 		parser.parseArgument(args);
 		// the path where the original fos csv file is stored
 		final String sourcePath = parser.get("sourcePath");
 		log.info("sourcePath {}", sourcePath);
 		// the path where to put the file as json
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath {}", outputPath);
 		final String hdfsNameNode = parser.get("hdfsNameNode");
 		log.info("hdfsNameNode {}", hdfsNameNode);
 		final String classForName = parser.get("classForName");
 		log.info("classForName {}", classForName);
 		// an empty delimiter argument falls back to the default instead of failing on charAt(0)
 		final char delimiter = Optional
 			.ofNullable(parser.get("delimiter"))
 			.filter(s -> !s.isEmpty())
 			.map(s -> s.charAt(0))
 			.orElse(DEFAULT_DELIMITER);
 		log.info("delimiter {}", delimiter);
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
 		FileSystem fileSystem = FileSystem.get(conf);
 		new GetFOSData().doRewrite(sourcePath, outputPath, classForName, delimiter, fileSystem);
 	}
 	/**
 	 * Opens the input file and delegates its conversion to {@code GetCSV.getCsv}; the reader is closed afterwards.
 	 * @param inputPath the path of the file to read
 	 * @param outputFile the output location forwarded to {@code GetCSV.getCsv}
 	 * @param classForName the class name forwarded to {@code GetCSV.getCsv}
 	 * @param delimiter the field delimiter forwarded to {@code GetCSV.getCsv}
 	 * @param fs the file system used to open the input
 	 * @throws IOException in case of problems accessing the file system
 	 * @throws ClassNotFoundException declared for {@code GetCSV.getCsv}; presumably raised when classForName
 	 *         cannot be loaded — verify against GetCSV
 	 */
 	public void doRewrite(String inputPath, String outputFile, String classForName, char delimiter, FileSystem fs)
 		throws IOException, ClassNotFoundException {
 		// reads the csv and writes it as its json equivalent
 		// NOTE(review): the reader uses the JVM default charset — confirm the encoding of the source file
 		try (InputStreamReader reader = new InputStreamReader(fs.open(new Path(inputPath)))) {
 			GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter);
 		}
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java
@ -0,0 +1,145 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
 import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
 import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.UPDATE_CLASS_NAME;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.hdfs.client.HdfsUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipDeserialize;
 import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipScore;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.Measure;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import eu.dnetlib.dhp.utils.DHPUtils;
 public class PrepareBipFinder implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	public static <I extends Result> void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				PrepareBipFinder.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String sourcePath = parser.get("sourcePath");
 		log.info("sourcePath {}: ", sourcePath);
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath {}: ", outputPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
 				prepareResults(spark, sourcePath, outputPath);
 			});
 	}
 	private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath) {
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
 			.textFile(inputPath)
 			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
 		spark
 			.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
 				BipScore bs = new BipScore();
 				bs.setId(key);
 				bs.setScoreList(entry.get(key));
 				return bs;
 			}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
 			.map((MapFunction<BipScore, Result>) v -> {
 				Result r = new Result();
 				r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
 				r.setMeasures(getMeasure(v));
 				return r;
 			}, Encoders.bean(Result.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(outputPath + "/bip");
 	}
 	private static List<Measure> getMeasure(BipScore value) {
 		return value
 			.getScoreList()
 			.stream()
 			.map(score -> {
 				Measure m = new Measure();
 				m.setId(score.getId());
 				m
 					.setUnit(
 						score
 							.getUnit()
 							.stream()
 							.map(unit -> {
 								KeyValue kv = new KeyValue();
 								kv.setValue(unit.getValue());
 								kv.setKey(unit.getKey());
 								kv
 									.setDataInfo(
 										OafMapperUtils
 											.dataInfo(
 												false,
 												UPDATE_DATA_INFO_TYPE,
 												true,
 												false,
 												OafMapperUtils
 													.qualifier(
 														UPDATE_MEASURE_BIP_CLASS_ID,
 														UPDATE_CLASS_NAME,
 														ModelConstants.DNET_PROVENANCE_ACTIONS,
 														ModelConstants.DNET_PROVENANCE_ACTIONS),
 												""));
 								return kv;
 							})
 							.collect(Collectors.toList()));
 				return m;
 			})
 			.collect(Collectors.toList());
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java
@ -0,0 +1,133 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
 import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import eu.dnetlib.dhp.utils.DHPUtils;
 public class PrepareFOSSparkJob implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				PrepareFOSSparkJob.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		String sourcePath = parser.get("sourcePath");
 		log.info("sourcePath: {}", sourcePath);
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				distributeFOSdois(
 					spark,
 					sourcePath,
 					outputPath);
 			});
 	}
 	private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) {
 		Dataset<FOSDataModel> fosDataset = readPath(spark, sourcePath, FOSDataModel.class);
 		fosDataset.flatMap((FlatMapFunction<FOSDataModel, FOSDataModel>) v -> {
 			List<FOSDataModel> fosList = new ArrayList<>();
 			final String level1 = v.getLevel1();
 			final String level2 = v.getLevel2();
 			final String level3 = v.getLevel3();
 			Arrays
 				.stream(v.getDoi().split("\u0002"))
 				.forEach(d -> fosList.add(FOSDataModel.newInstance(d, level1, level2, level3)));
 			return fosList.iterator();
 		}, Encoders.bean(FOSDataModel.class))
 			.map((MapFunction<FOSDataModel, Result>) value -> {
 				Result r = new Result();
 				r.setId(DHPUtils.generateUnresolvedIdentifier(value.getDoi(), DOI));
 				r.setSubject(getSubjects(value));
 				return r;
 			}, Encoders.bean(Result.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(outputPath + "/fos");
 	}
 	private static List<StructuredProperty> getSubjects(FOSDataModel fos) {
 		return Arrays
 			.asList(getSubject(fos.getLevel1()), getSubject(fos.getLevel2()), getSubject(fos.getLevel3()))
 			.stream()
 			.filter(Objects::nonNull)
 			.collect(Collectors.toList());
 	}
 	private static StructuredProperty getSubject(String sbj) {
 		if (sbj.equals(NULL))
 			return null;
 		StructuredProperty sp = new StructuredProperty();
 		sp.setValue(sbj);
 		sp
 			.setQualifier(
 				OafMapperUtils
 					.qualifier(
 						FOS_CLASS_ID,
 						FOS_CLASS_NAME,
 						ModelConstants.DNET_SUBJECT_TYPOLOGIES,
 						ModelConstants.DNET_SUBJECT_TYPOLOGIES));
 		sp
 			.setDataInfo(
 				OafMapperUtils
 					.dataInfo(
 						false,
 						UPDATE_DATA_INFO_TYPE,
 						true,
 						false,
 						OafMapperUtils
 							.qualifier(
 								UPDATE_SUBJECT_FOS_CLASS_ID,
 								UPDATE_CLASS_NAME,
 								ModelConstants.DNET_PROVENANCE_ACTIONS,
 								ModelConstants.DNET_PROVENANCE_ACTIONS),
 						""));
 		return sp;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java
@ -0,0 +1,79 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
 import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.Result;
 public class SparkSaveUnresolved implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				PrepareFOSSparkJob.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		String sourcePath = parser.get("sourcePath");
 		log.info("sourcePath: {}", sourcePath);
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				saveUnresolved(
 					spark,
 					sourcePath,
 					outputPath);
 			});
 	}
 	private static void saveUnresolved(SparkSession spark, String sourcePath, String outputPath) {
 		spark
 			.read()
 			.textFile(sourcePath + "/*")
 			.map(
 				(MapFunction<String, Result>) l -> OBJECT_MAPPER.readValue(l, Result.class),
 				Encoders.bean(Result.class))
 			.groupByKey((MapFunction<Result, String>) r -> r.getId(), Encoders.STRING())
 			.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
 				Result ret = it.next();
 				it.forEachRemaining(r -> ret.mergeFrom(r));
 				return ret;
 			}, Encoders.bean(Result.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(outputPath);
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipDeserialize.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipDeserialize.java
@ -0,0 +1,28 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 /**
  * Maps the model of the bipFinder input data: a map from a string key to a list of {@link Score}.
  * Only needed for deserialization purposes.
  */
 public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
 	public BipDeserialize() {
 		super();
 	}
 	/**
 	 * Lookup that never returns {@code null}.
 	 * Declared with a String parameter, so it overloads (rather than overrides)
 	 * {@link HashMap#get(Object)}.
 	 *
 	 * @param key the key to look up
 	 * @return the stored list, or a new empty list when the map holds no non-null value for the key
 	 */
 	public List<Score> get(String key) {
 		final List<Score> scores = super.get(key);
 		return scores != null ? scores : new ArrayList<>();
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipScore.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipScore.java
@ -0,0 +1,30 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
 import java.io.Serializable;
 import java.util.List;
 /**
  * Rewriting of the bipFinder input data in which the identifier of the result (the doi)
  * is extracted into its own field.
  */
 public class BipScore implements Serializable {
 	/** Identifier of the result (doi). */
 	private String id;
 	/** Scores, with the unit as given in the input file. */
 	private List<Score> scoreList;
 	/** @return the identifier of the result */
 	public String getId() {
 		return this.id;
 	}
 	/** @param id the identifier of the result */
 	public void setId(final String id) {
 		this.id = id;
 	}
 	/** @return the scores associated with the result */
 	public List<Score> getScoreList() {
 		return this.scoreList;
 	}
 	/** @param scoreList the scores associated with the result */
 	public void setScoreList(final List<Score> scoreList) {
 		this.scoreList = scoreList;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java
@ -0,0 +1,71 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
 import java.io.Serializable;
 import com.opencsv.bean.CsvBindByPosition;
 /**
  * One record of the FOS input: a doi value and its three classification levels.
  * The fields are bound to CSV columns by position (1 to 4).
  */
 public class FOSDataModel implements Serializable {
 	@CsvBindByPosition(position = 1)
 //    @CsvBindByName(column = "doi")
 	private String doi;
 	@CsvBindByPosition(position = 2)
 //    @CsvBindByName(column = "level1")
 	private String level1;
 	@CsvBindByPosition(position = 3)
 //    @CsvBindByName(column = "level2")
 	private String level2;
 	@CsvBindByPosition(position = 4)
 //    @CsvBindByName(column = "level3")
 	private String level3;
 	/** No-argument constructor, leaves every field unset. */
 	public FOSDataModel() {
 	}
 	/** Creates a record with every field set. */
 	public FOSDataModel(String doi, String level1, String level2, String level3) {
 		this.doi = doi;
 		this.level1 = level1;
 		this.level2 = level2;
 		this.level3 = level3;
 	}
 	/**
 	 * Static factory equivalent to the four-argument constructor.
 	 *
 	 * @param d the doi value
 	 * @param level1 first level
 	 * @param level2 second level
 	 * @param level3 third level
 	 * @return a new, fully populated record
 	 */
 	public static FOSDataModel newInstance(String d, String level1, String level2, String level3) {
 		final FOSDataModel model = new FOSDataModel();
 		model.setDoi(d);
 		model.setLevel1(level1);
 		model.setLevel2(level2);
 		model.setLevel3(level3);
 		return model;
 	}
 	public String getDoi() {
 		return this.doi;
 	}
 	public void setDoi(final String doi) {
 		this.doi = doi;
 	}
 	public String getLevel1() {
 		return this.level1;
 	}
 	public void setLevel1(final String level1) {
 		this.level1 = level1;
 	}
 	public String getLevel2() {
 		return this.level2;
 	}
 	public void setLevel2(final String level2) {
 		this.level2 = level2;
 	}
 	public String getLevel3() {
 		return this.level3;
 	}
 	public void setLevel3(final String level3) {
 		this.level3 = level3;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/KeyValue.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/KeyValue.java
@ -0,0 +1,26 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
 import java.io.Serializable;
 /**
  * Serializable bean holding a string key and its string value.
  */
 public class KeyValue implements Serializable {
 	private String key;
 	private String value;
 	/** @return the key, or {@code null} when it has not been set */
 	public String getKey() {
 		return this.key;
 	}
 	/** @param key the key to store */
 	public void setKey(final String key) {
 		this.key = key;
 	}
 	/** @return the value, or {@code null} when it has not been set */
 	public String getValue() {
 		return this.value;
 	}
 	/** @param value the value to store */
 	public void setValue(final String value) {
 		this.value = value;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/Score.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/Score.java
@ -0,0 +1,30 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
 import java.io.Serializable;
 import java.util.List;
 /**
  * Represents a score as found in the input file: an identifier plus a list of key/value units.
  */
 public class Score implements Serializable {
 	private String id;
 	private List<KeyValue> unit;
 	/** @return the identifier of the score */
 	public String getId() {
 		return this.id;
 	}
 	/** @param id the identifier of the score */
 	public void setId(final String id) {
 		this.id = id;
 	}
 	/** @return the key/value units of the score */
 	public List<KeyValue> getUnit() {
 		return this.unit;
 	}
 	/** @param unit the key/value units of the score */
 	public void setUnit(final List<KeyValue> unit) {
 		this.unit = unit;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala
@ -1,41 +0,0 @@
 package eu.dnetlib.dhp.actionmanager.datacite
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import org.apache.hadoop.io.Text
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.hadoop.mapred.SequenceFileOutputFormat
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import scala.io.Source
 /**
  * Spark job that loads a dataset of Oaf objects, converts each one to a (String, String) pair
  * through DataciteToOAFTransformation.toActionSet, drops the null results and stores the pairs
  * as a gzip-compressed Hadoop sequence file of Text/Text.
  */
 object ExportActionSetJobNode {
  val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
  /**
   * Entry point.
   *
   * @param args command line arguments, parsed according to exportDataset_parameters.json
   *             (master, sourcePath, targetPath)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf
    val parametersJson = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString
    val parser = new ArgumentApplicationParser(parametersJson)
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
    val sessionBuilder = SparkSession.builder().config(sparkConf)
      .appName(ExportActionSetJobNode.getClass.getSimpleName)
      .master(master)
    val spark: SparkSession = sessionBuilder.getOrCreate()
    // encoders picked up implicitly by as[Oaf] and by the map to (String, String)
    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
    val actions: Dataset[(String, String)] = spark.read.load(sourcePath).as[Oaf]
      .map(oaf => DataciteToOAFTransformation.toActionSet(oaf))
      .filter(pair => pair != null)
    actions.rdd
      .map(pair => (new Text(pair._1), new Text(pair._2)))
      .saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/FilterCrossrefEntitiesSpark.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/FilterCrossrefEntitiesSpark.scala
@ -1,46 +0,0 @@
 package eu.dnetlib.dhp.actionmanager.datacite
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
 import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import scala.io.Source
 /**
  * Spark job that loads a dataset of Oaf objects, keeps only the instances of Result
  * and overwrites the target path with them.
  */
 object FilterCrossrefEntitiesSpark {
  // getClass.getClass would name the logger after java.lang.Class; use this object's own class
  val log: Logger = LoggerFactory.getLogger(FilterCrossrefEntitiesSpark.getClass)
  /**
   * Entry point.
   *
   * @param args command line arguments, parsed according to filter_crossref_param.json
   *             (master, sourcePath, targetPath)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf
    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString)
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    log.info("sourcePath: {}", sourcePath)
    val targetPath = parser.get("targetPath")
    log.info("targetPath: {}", targetPath)
    val spark: SparkSession = SparkSession.builder().config(conf)
      .appName(getClass.getSimpleName)
      .master(master)
      .getOrCreate()
    // kryo encoders used implicitly by as[Oaf] and by the map to Result
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]
    val d: Dataset[Oaf] = spark.read.load(sourcePath).as[Oaf]
    d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath)
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala
@ -1,48 +0,0 @@
 package eu.dnetlib.dhp.actionmanager.datacite
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
 import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import scala.io.Source
 /**
  * Spark job that loads the DataciteType records, keeps the active ones, maps each of them with
  * DataciteToOAFTransformation.generateOAF and overwrites the target path with the non-null results.
  */
 object GenerateDataciteDatasetSpark {
  val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
  /**
   * Entry point.
   *
   * @param args command line arguments, parsed according to generate_dataset_params.json
   *             (master, sourcePath, targetPath, exportLinks, isLookupUrl)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf
    val parametersJson = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString
    val parser = new ArgumentApplicationParser(parametersJson)
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
    // anything other than "true" (case-insensitive), including a missing value, disables the flag
    val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(ISLookupClientFactory.getLookUpService(isLookupUrl))
    val sessionBuilder = SparkSession.builder().config(sparkConf)
      .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
      .master(master)
    val spark: SparkSession = sessionBuilder.getOrCreate()
    implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    import spark.implicits._
    val activeRecords = spark.read.load(sourcePath).as[DataciteType]
      .filter(record => record.isActive)
    activeRecords
      .flatMap(record => DataciteToOAFTransformation.generateOAF(record.json, record.timestamp, record.timestamp, vocabularies, exportLinks))
      .filter(oaf => oaf != null)
      .write.mode(SaveMode.Overwrite).save(targetPath)
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java
@ -0,0 +1,181 @@
 package eu.dnetlib.dhp.actionmanager.opencitations;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.*;
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import scala.Tuple2;
 /**
  * Spark job that turns comma-separated OpenCitations lines into citation {@link Relation}s
  * (Cites / IsCitedBy), wraps them in {@link AtomicAction}s and stores them as a Hadoop sequence file.
  */
 public class CreateActionSetSparkJob implements Serializable {
 	public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations";
 	public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations";
 	private static final String ID_PREFIX = "50|doi_________::";
 	private static final String TRUST = "0.91";
 	// suffix that triggers the additional relations when shouldDuplicateRels is enabled
 	private static final String REFS_SUFFIX = ".refs";
 	private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	/**
 	 * Entry point: parses the arguments described by as_parameters.json and runs the job.
 	 *
 	 * @param args command line arguments (inputPath, outputPath, optional isSparkSessionManaged
 	 *             and shouldDuplicateRels)
 	 * @throws IOException if the parameter description cannot be read
 	 * @throws ParseException if the arguments cannot be parsed
 	 */
 	public static void main(final String[] args) throws IOException, ParseException {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					Objects
 						.requireNonNull(
 							CreateActionSetSparkJob.class
 								.getResourceAsStream(
 									"/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json"))));
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String inputPath = parser.get("inputPath");
 		// the placeholder formats the value itself; no toString() that could fail on null
 		log.info("inputPath {}", inputPath);
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath {}", outputPath);
 		final boolean shouldDuplicateRels = Optional
 			.ofNullable(parser.get("shouldDuplicateRels"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.FALSE);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				extractContent(spark, inputPath, outputPath, shouldDuplicateRels);
 			});
 	}
 	/**
 	 * Reads every entry under inputPath as text, maps each line to its relations and writes them,
 	 * serialized as JSON atomic actions keyed by the payload class name, to outputPath.
 	 */
 	private static void extractContent(SparkSession spark, String inputPath, String outputPath,
 		boolean shouldDuplicateRels) {
 		spark
 			.sqlContext()
 			.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
 			.flatMap(
 				(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
 				Encoders.bean(Relation.class))
 			.filter((FilterFunction<Relation>) value -> value != null)
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p))
 			.mapToPair(
 				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
 			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
 	}
 	/**
 	 * Builds the relations for one input line. The second column (index 1) is read as the citing
 	 * DOI and the third (index 2) as the cited DOI.
 	 *
 	 * @param value one comma-separated input line
 	 * @param duplicate when true and the citing DOI ends with ".refs", the relations are also
 	 *                  produced for the citing DOI without that suffix
 	 * @return the relations, empty when the line has fewer than three columns or the citing value
 	 *         does not start with "10."
 	 */
 	private static List<Relation> createRelation(String value, boolean duplicate) {
 		String[] line = value.split(",");
 		// short lines used to fail with ArrayIndexOutOfBoundsException; skip them like non-DOI lines
 		if (line.length < 3 || !line[1].startsWith("10.")) {
 			return new ArrayList<>();
 		}
 		List<Relation> relationList = new ArrayList<>();
 		String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
 		final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
 		relationList
 			.addAll(
 				getRelations(
 					citing,
 					cited));
 		if (duplicate && line[1].endsWith(REFS_SUFFIX)) {
 			// strip exactly the trailing suffix: indexOf would cut at an earlier ".refs" occurrence
 			final String withoutSuffix = line[1].substring(0, line[1].length() - REFS_SUFFIX.length());
 			citing = ID_PREFIX + IdentifierFactory
 				.md5(CleaningFunctions.normalizePidValue("doi", withoutSuffix));
 			relationList.addAll(getRelations(citing, cited));
 		}
 		return relationList;
 	}
 	/** Returns the Cites relation from citing to cited and the inverse IsCitedBy relation. */
 	private static Collection<Relation> getRelations(String citing, String cited) {
 		return Arrays
 			.asList(
 				getRelation(citing, cited, ModelConstants.CITES),
 				getRelation(cited, citing, ModelConstants.IS_CITED_BY));
 	}
 	/**
 	 * Creates a result-result citation relation collected from OpenCitations.
 	 *
 	 * @param source identifier of the source of the relation
 	 * @param target identifier of the target of the relation
 	 * @param relclass relation class to set
 	 * @return the populated relation
 	 */
 	public static Relation getRelation(
 		String source,
 		String target,
 		String relclass) {
 		Relation r = new Relation();
 		r.setCollectedfrom(getCollectedFrom());
 		r.setSource(source);
 		r.setTarget(target);
 		r.setRelClass(relclass);
 		r.setRelType(ModelConstants.RESULT_RESULT);
 		r.setSubRelType(ModelConstants.CITATION);
 		r
 			.setDataInfo(
 				getDataInfo());
 		return r;
 	}
 	/** Returns a single-element list with the OpenCitations key/value pair. */
 	public static List<KeyValue> getCollectedFrom() {
 		KeyValue kv = new KeyValue();
 		kv.setKey(ModelConstants.OPENOCITATIONS_ID);
 		kv.setValue(ModelConstants.OPENOCITATIONS_NAME);
 		return Arrays.asList(kv);
 	}
 	/** Returns the data info shared by every relation: not inferred, not deleted, trust TRUST. */
 	public static DataInfo getDataInfo() {
 		DataInfo di = new DataInfo();
 		di.setInferred(false);
 		di.setDeletedbyinference(false);
 		di.setTrust(TRUST);
 		di
 			.setProvenanceaction(
 				getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS));
 		return di;
 	}
 	/**
 	 * Creates a qualifier whose scheme id and scheme name are both set to qualifierSchema.
 	 *
 	 * @param class_id the class id
 	 * @param class_name the class name
 	 * @param qualifierSchema value used for both scheme id and scheme name
 	 * @return the populated qualifier
 	 */
 	public static Qualifier getQualifier(String class_id, String class_name,
 		String qualifierSchema) {
 		Qualifier pa = new Qualifier();
 		pa.setClassid(class_id);
 		pa.setClassname(class_name);
 		pa.setSchemeid(qualifierSchema);
 		pa.setSchemename(qualifierSchema);
 		return pa;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java
@ -0,0 +1,93 @@
 package eu.dnetlib.dhp.actionmanager.opencitations;
 import java.io.*;
 import java.io.Serializable;
 import java.util.Objects;
 import java.util.zip.GZIPOutputStream;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 /**
  * Unpacks OpenCitations zip archives stored on the file system: every non-directory zip entry
  * is re-written as a gzip file under workingPath/COCI.
  */
 public class GetOpenCitationsRefs implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(GetOpenCitationsRefs.class);
 	/**
 	 * Entry point: parses the arguments described by input_parameters.json and extracts each input file.
 	 *
 	 * @param args command line arguments (inputFile as a ';'-separated list, workingPath, hdfsNameNode)
 	 * @throws IOException if the parameters cannot be read or the file system access fails
 	 * @throws ParseException if the arguments cannot be parsed
 	 */
 	public static void main(final String[] args) throws IOException, ParseException {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					Objects
 						.requireNonNull(
 							GetOpenCitationsRefs.class
 								.getResourceAsStream(
 									"/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json"))));
 		parser.parseArgument(args);
 		final String[] inputFile = parser.get("inputFile").split(";");
 		// log the file names: toString() on an array only prints its type and identity hash
 		log.info("inputFile {}", String.join(", ", inputFile));
 		final String workingPath = parser.get("workingPath");
 		log.info("workingPath {}", workingPath);
 		final String hdfsNameNode = parser.get("hdfsNameNode");
 		log.info("hdfsNameNode {}", hdfsNameNode);
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
 		FileSystem fileSystem = FileSystem.get(conf);
 		GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
 		for (String file : inputFile) {
 			ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem);
 		}
 	}
 	/**
 	 * Copies every non-directory entry of the zip archive into its own gzip file.
 	 * The output name is the entry name cut at its first 'T' (the whole name when it has none),
 	 * followed by '_' and a counter that starts at 1 for each archive.
 	 *
 	 * @param inputFile path of the zip archive to read
 	 * @param workingPath base directory; the output goes to workingPath/COCI
 	 * @param fileSystem file system used both for reading and for writing
 	 * @throws IOException if reading the archive or writing an output file fails
 	 */
 	private void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
 		throws IOException {
 		final Path path = new Path(inputFile);
 		int count = 1;
 		// both streams are declared as resources so the input is released even when wrapping it fails
 		try (FSDataInputStream oc_zip = fileSystem.open(path);
 			ZipInputStream zis = new ZipInputStream(oc_zip)) {
 			ZipEntry entry = null;
 			while ((entry = zis.getNextEntry()) != null) {
 				if (!entry.isDirectory()) {
 					String fileName = entry.getName();
 					// an entry name without 'T' used to fail with StringIndexOutOfBoundsException
 					final int cut = fileName.indexOf("T");
 					fileName = (cut >= 0 ? fileName.substring(0, cut) : fileName) + "_" + count;
 					count++;
 					try (
 						FSDataOutputStream out = fileSystem
 							.create(new Path(workingPath + "/COCI/" + fileName + ".gz"));
 						GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
 						IOUtils.copy(zis, gzipOs);
 					}
 				}
 			}
 		}
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala
@ -60,14 +60,10 @@ object SparkCreateActionset {
    val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
    entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
    entities
      .joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
      .map(p => p._1._2)
      .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala
@ -0,0 +1,49 @@
 package eu.dnetlib.dhp.collection
 import eu.dnetlib.dhp.schema.common.ModelSupport
 import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
 object CollectionUtils {
  /**
   * Meant to be chained after the transformation phase, typically inside a flatMap:
   * it generates relations in both directions.
   *
   * Note that an input relation is updated in place: its relType, subRelType and relClass are
   * overwritten with the values returned by ModelSupport.findRelation.
   *
   * @param i input OAF
   * @return
   * If the input OAF is an entity -> List(i)
   * If the input OAF is a relation known to ModelSupport -> List(relation, inverseRelation)
   * In every other case (unknown relation class, null or unexpected input) -> empty list
   *
   */
  def fixRelations(i: Oaf): List[Oaf] = i match {
    case entity: OafEntity => List(entity)
    case r: Relation =>
      // findRelation yields null for an unknown relation class: such relations are dropped
      Option(ModelSupport.findRelation(r.getRelClass)).map { currentRel =>
        // Cleaning relation
        r.setRelType(currentRel.getRelType)
        r.setSubRelType(currentRel.getSubReltype)
        r.setRelClass(currentRel.getRelClass)
        val inverse = new Relation
        inverse.setSource(r.getTarget)
        inverse.setTarget(r.getSource)
        inverse.setRelType(currentRel.getRelType)
        inverse.setSubRelType(currentRel.getSubReltype)
        inverse.setRelClass(currentRel.getInverseRelClass)
        inverse.setCollectedfrom(r.getCollectedfrom)
        inverse.setDataInfo(r.getDataInfo)
        inverse.setProperties(r.getProperties)
        inverse.setLastupdatetimestamp(r.getLastupdatetimestamp)
        inverse.setValidated(r.getValidated)
        inverse.setValidationDate(r.getValidationDate)
        List[Oaf](r, inverse)
      }.getOrElse(List[Oaf]())
    // null or any other Oaf: nothing to emit (previously a NullPointerException / ClassCastException)
    case _ => List[Oaf]()
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala
@ -1,12 +1,10 @@
-package eu.dnetlib.dhp.actionmanager.datacite
+package eu.dnetlib.dhp.datacite
 import org.apache.commons.io.IOUtils
 import org.apache.http.client.config.RequestConfig
-import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
+import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
 import org.apache.http.entity.StringEntity
-import org.apache.http.impl.client.{HttpClientBuilder, HttpClients}
+import org.apache.http.impl.client.HttpClientBuilder
 import java.io.IOException
 abstract class AbstractRestClient extends Iterator[String] {
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala
@ -1,7 +1,7 @@
-package eu.dnetlib.dhp.actionmanager.datacite
+package eu.dnetlib.dhp.datacite
 import org.json4s.{DefaultFormats, JValue}
 import org.json4s.jackson.JsonMethods.{compact, parse, render}
 import org.json4s.{DefaultFormats, JValue}
 class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
@ -0,0 +1,134 @@
 package eu.dnetlib.dhp.datacite
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
 import java.io.InputStream
 import java.time.format.DateTimeFormatter
 import java.util.Locale
 import java.util.regex.Pattern
 import scala.io.Source
 /**
  * This class represents the data model of the input Dataset of Datacite.
  *
  * @param doi       the DOI
  * @param timestamp timestamp of the last update date
  * @param isActive  whether the record is active or deleted
  * @param json      the native JSON record
  */
 case class DataciteType(
   doi: String,
   timestamp: Long,
   isActive: Boolean,
   json: String
 )
 /*
  The following classes are utility classes used for the mapping from
  JSON Datacite to the OAF schema
 */
 case class RelatedIdentifierType(
   relationType: String,
   relatedIdentifier: String,
   relatedIdentifierType: String
 )
 case class NameIdentifiersType(
   nameIdentifierScheme: Option[String],
   schemeUri: Option[String],
   nameIdentifier: Option[String]
 )
 case class CreatorType(
   nameType: Option[String],
   nameIdentifiers: Option[List[NameIdentifiersType]],
   name: Option[String],
   familyName: Option[String],
   givenName: Option[String],
   affiliation: Option[List[String]]
 )
 case class TitleType(
   title: Option[String],
   titleType: Option[String],
   lang: Option[String]
 )
 case class SubjectType(
   subject: Option[String],
   subjectScheme: Option[String]
 )
 case class DescriptionType(
   descriptionType: Option[String],
   description: Option[String]
 )
 case class FundingReferenceType(
   funderIdentifierType: Option[String],
   awardTitle: Option[String],
   awardUri: Option[String],
   funderName: Option[String],
   funderIdentifier: Option[String],
   awardNumber: Option[String]
 )
 case class DateType(
   date: Option[String],
   dateType: Option[String]
 )
 /** A relation name together with the name of its inverse and its relation type. */
 case class OAFRelations(
   relation: String,
   inverse: String,
   relType: String
 )
 class DataciteModelConstants extends Serializable {
 }
 /**
  * Constants shared by the Datacite mapping: relation lookup table, filter list loaded from the
  * classpath, date formatters and the regular expressions used to recognise funders and dates.
  */
 object DataciteModelConstants {
  val REL_TYPE_VALUE:String = "resultResult"
  val DATE_RELATION_KEY = "RelationDate"
  val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"
  val DATACITE_NAME = "Datacite"
  val dataInfo: DataInfo = dataciteDataInfo("0.9")
  val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
  // relation name -> (relation, inverse, sub relation type); each key is listed once
  val subRelTypeMapping: Map[String,OAFRelations] = Map(
    ModelConstants.REFERENCES ->            OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_REFERENCED_BY ->      OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_SUPPLEMENTED_BY ->    OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
    ModelConstants.IS_SUPPLEMENT_TO ->      OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
    ModelConstants.HAS_PART ->              OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
    ModelConstants.IS_PART_OF ->            OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
    ModelConstants.IS_VERSION_OF->          OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
    ModelConstants.HAS_VERSION->            OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
    ModelConstants.IS_IDENTICAL_TO ->       OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_CONTINUED_BY ->       OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
    ModelConstants.CONTINUES ->             OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_NEW_VERSION_OF->      OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
    ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
    ModelConstants.IS_DOCUMENTED_BY ->      OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
    ModelConstants.DOCUMENTS ->             OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_SOURCE_OF ->          OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    ModelConstants.IS_DERIVED_FROM ->       OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
    ModelConstants.CITES ->                 OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
    ModelConstants.IS_CITED_BY ->           OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
    ModelConstants.IS_VARIANT_FORM_OF ->    OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    ModelConstants.IS_OBSOLETED_BY ->       OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
    ModelConstants.REVIEWS ->               OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
    ModelConstants.IS_REVIEWED_BY ->        OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
    ModelConstants.COMPILES ->              OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_COMPILED_BY ->        OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
  )
  // lines of the classpath resource DATACITE_FILTER_PATH, read once when the object is initialised
  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null, s"resource not found on the classpath: $DATACITE_FILTER_PATH")
    val source = Source.fromInputStream(stream)
    // materialise the lines before closing, so that the underlying stream is always released
    try source.getLines().toList finally source.close()
  }
  /**
   * Builds the DataInfo used for Datacite records.
   *
   * @param trust the trust value to set
   * @return a DataInfo carrying ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER and the given trust
   */
  def dataciteDataInfo(trust: String): DataInfo =  OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
  // every bracketed section is an optional alternative pattern
  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
  val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
  // (pattern matching a grant agreement URI with a six-digit code, identifier prefix paired with it)
  val funder_regex: List[(Pattern, String)] = List(
    (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
    (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
  )
  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
    //M-D-Y
    Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
    //D-M-Y
    Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala
@ -1,7 +1,8 @@
-package eu.dnetlib.dhp.actionmanager.datacite
+package eu.dnetlib.dhp.datacite
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
 import eu.dnetlib.dhp.datacite.DataciteModelConstants._
 import eu.dnetlib.dhp.schema.action.AtomicAction
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
@ -12,121 +13,30 @@ import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
 import java.nio.charset.CodingErrorAction
 import java.text.SimpleDateFormat
 import java.time.LocalDate
 import java.time.chrono.ThaiBuddhistDate
 import java.time.format.DateTimeFormatter
 import java.util.regex.Pattern
 import java.util.{Date, Locale}
 import scala.collection.JavaConverters._
 import scala.io.{Codec, Source}
 import scala.language.postfixOps
 /**
  * Native Datacite record: its DOI, a timestamp, the flag telling whether the record is
  * active and the JSON payload of the record.
  */
 case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String)

 /** Related identifier of a record: relation type, identifier value and identifier type. */
 case class RelatedIdentifierType(
   relationType: String,
   relatedIdentifier: String,
   relatedIdentifierType: String
 )

 /** Name identifier of a creator (scheme, scheme URI and value); every field is optional. */
 case class NameIdentifiersType(
   nameIdentifierScheme: Option[String],
   schemeUri: Option[String],
   nameIdentifier: Option[String]
 )

 /** Creator of a record; every field is optional. */
 case class CreatorType(
   nameType: Option[String],
   nameIdentifiers: Option[List[NameIdentifiersType]],
   name: Option[String],
   familyName: Option[String],
   givenName: Option[String],
   affiliation: Option[List[String]]
 )

 /** Title of a record with its optional type and language. */
 case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String])

 /** Subject of a record with its optional scheme. */
 case class SubjectType(subject: Option[String], subjectScheme: Option[String])

 /** Description of a record with its optional type. */
 case class DescriptionType(descriptionType: Option[String], description: Option[String])

 /** Funding reference of a record (funder and award information); every field is optional. */
 case class FundingReferenceType(
   funderIdentifierType: Option[String],
   awardTitle: Option[String],
   awardUri: Option[String],
   funderName: Option[String],
   funderIdentifier: Option[String],
   awardNumber: Option[String]
 )

 /** Date of a record with its optional type. */
 case class DateType(date: Option[String], dateType: Option[String])

 /**
  * Entry describing a hosting datasource: OpenAIRE identifier, Datacite name, official
  * name and an optional similarity score.
  * NOTE(review): the field names are presumably bound to JSON keys by json4s extraction;
  * do not rename them without checking the consumers.
  */
 case class HostedByMapType(
   openaire_id: String,
   datacite_name: String,
   official_name: String,
   similarity: Option[Float]
 )
 object DataciteToOAFTransformation {
  // Relation type set (via setRelType) on the relations generated by this transformation.
  val REL_TYPE_VALUE:String = "resultResult"
  // NOTE(review): presumably the key of the relation property carrying the relation date -- confirm against its usages.
  val DATE_RELATION_KEY = "RelationDate"
  /**
   * Maps a relation type name to a pair (relation name, sub relation type).
   *
   * The second element of the pair is the sub relation type assigned to the relation.
   * NOTE(review): the first element looks like the name of the inverse relation -- confirm.
   * Relation types missing from this map have no entry: a direct lookup fails for them.
   */
  val subRelTypeMapping: Map[String, (String, String)] = {
    // expands (relation type, paired relation) couples sharing the same sub relation type
    def withSubRelType(subRelType: String)(pairs: (String, String)*): Seq[(String, (String, String))] =
      pairs.map { case (relationType, pairedRelation) => relationType -> ((pairedRelation, subRelType)) }

    val relationship = withSubRelType("relationship")(
      "References" -> "IsReferencedBy",
      "IsReferencedBy" -> "References",
      "IsIdenticalTo" -> "IsIdenticalTo",
      "IsContinuedBy" -> "Continues",
      "Continues" -> "IsContinuedBy",
      "IsDocumentedBy" -> "Documents",
      "Documents" -> "IsDocumentedBy",
      "IsSourceOf" -> "IsDerivedFrom",
      "IsDerivedFrom" -> "IsSourceOf",
      "IsCompiledBy" -> "Compiles",
      "Compiles" -> "IsCompiledBy"
    )
    val supplement = withSubRelType("supplement")(
      "IsSupplementTo" -> "IsSupplementedBy",
      "IsSupplementedBy" -> "IsSupplementTo"
    )
    val part = withSubRelType("part")(
      "IsPartOf" -> "HasPart",
      "HasPart" -> "IsPartOf"
    )
    val version = withSubRelType("version")(
      "IsVersionOf" -> "HasVersion",
      "HasVersion" -> "IsVersionOf",
      "IsPreviousVersionOf" -> "IsNewVersionOf",
      "IsNewVersionOf" -> "IsPreviousVersionOf",
      "IsVariantFormOf" -> "IsDerivedFrom",
      "IsObsoletedBy" -> "IsNewVersionOf"
    )
    val citation = withSubRelType("citation")(
      "Cites" -> "IsCitedBy",
      "IsCitedBy" -> "Cites"
    )
    val review = withSubRelType("review")(
      "Reviews" -> "IsReviewedBy",
      "IsReviewedBy" -> "Reviews"
    )

    (relationship ++ supplement ++ part ++ version ++ citation ++ review).toMap
  }
  // Implicit UTF-8 codec configured to replace malformed input and unmappable characters
  // (CodingErrorAction.REPLACE).
  implicit val codec: Codec = Codec("UTF-8")
    .onMalformedInput(CodingErrorAction.REPLACE)
    .onUnmappableCharacter(CodingErrorAction.REPLACE)
  // NOTE(review): presumably the class id used for DOI identifiers -- confirm against its usages.
  val DOI_CLASS = "doi"
  // NOTE(review): presumably the class id used for subjects/keywords -- confirm against its usages.
  val SUBJ_CLASS = "keywords"
  /**
   * Lines of the classpath resource "datacite_filter" (resolved relative to this class),
   * loaded once when the object is initialised.
   */
  val j_filter: List[String] = {
    val filterSource = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter"))
    // getLines() avoids String.lines, which on JDK 11+ resolves to the java.lang.String
    // method returning a java Stream; the source is closed once the lines are materialised
    try filterSource.getLines().toList
    finally filterSource.close()
  }
  // Shared Jackson ObjectMapper instance.
  val mapper = new ObjectMapper()
  // Hosting entry built from the "unknown repository" constants of ModelConstants, with similarity 1.0.
  // NOTE(review): presumably the fallback when a datasource cannot be resolved -- confirm against its usages.
  val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
  // DataInfo shared by the generated objects, created with trust "0.9".
  val dataInfo: DataInfo = generateDataInfo("0.9")
  // KeyValue identifying Datacite (ModelConstants.DATACITE_ID, "Datacite"), used as collectedFrom.
  val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
-  val hostedByMap: Map[String, HostedByMapType] = {
+  /**
-    val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
+   * This method should skip record if json contains invalid text
-    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+   * defined in file datacite_filter
-    lazy val json: org.json4s.JValue = parse(s)
+   * @param json
-    json.extract[Map[String, HostedByMapType]]
+   * @return True if the record should be skipped
-  }
+   */
-
+  def skip_record(json: String): Boolean = {
-  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
+    datacite_filter.exists(f => json.contains(f))
  val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
  /**
   * Patterns recognising EC grant agreement URIs, each paired with a string prefix.
   *
   * Every pattern has three groups: the "info:eu-repo/grantagreement/ec/<programme>/" head,
   * the six-digit code and whatever follows. Matching is case insensitive.
   * NOTE(review): the paired string is presumably the prefix of the project identifier
   * built from the six-digit code -- confirm against the code consuming this list.
   */
  val funder_regex: List[(Pattern, String)] = {
    val flags = Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
    val h2020 = Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", flags) -> "40|corda__h2020::"
    val fp7 = Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", flags) -> "40|corda_______::"
    List(h2020, fp7)
  }
  /**
   * Patterns recognising a date, listed in this order:
   *  - Y-M-D: four-digit year (18xx, 19xx, 20xx), month and day separated by the same
   *    character among '-', ' ', '/' and '.';
   *  - M-D-Y: month (leading zero optional), day and a two or four digit year;
   *  - D-M-Y: day, numeric month or English three-letter month name, and year; the
   *    pattern checks the number of days of each month and accepts the 29th of
   *    February only for leap years;
   *  - Y: a bare four-digit year (19xx or 20xx).
   */
  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
    //M-D-Y
    Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
    //D-M-Y
    // The months allowed for day 29/30 are written as [13-9]: the previous class [1,3-9]
    // also contained a literal comma, so a "," was accepted in place of the month.
    Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[13-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )
  /**
   * Tells whether the given JSON text contains at least one of the strings of j_filter.
   *
   * @param json the JSON text of the record
   * @return true when any entry of j_filter occurs in the text
   */
  def filter_json(json: String): Boolean =
    j_filter.exists(blockedText => json.contains(blockedText))
  @deprecated("this method will be removed", "dhp")
  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()
@ -206,6 +116,8 @@ object DataciteToOAFTransformation {
      case _: Throwable => ""
    }
  }
  def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
    if (resourceType != null && resourceType.nonEmpty) {
      val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
@ -324,10 +236,7 @@ object DataciteToOAFTransformation {
      val p = match_pattern.get._2
      val grantId = m.matcher(awardUri).replaceAll("$2")
      val targetId = s"$p${DHPUtils.md5(grantId)}"
-      List(
+      List( generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo) )
        generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
        generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
      )
    }
    else
      List()
@ -336,7 +245,7 @@ object DataciteToOAFTransformation {
  def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
-    if (filter_json(input))
+    if (skip_record(input))
      return List()
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -515,8 +424,8 @@ object DataciteToOAFTransformation {
    val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
    if (client.isDefined) {
-      val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
+
-      instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
+      instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
      instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
      instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
      instance.setAccessright(access_rights_qualifier)
@ -570,7 +479,7 @@ object DataciteToOAFTransformation {
        rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
        rel.setDataInfo(dataInfo)
-        val subRelType = subRelTypeMapping(r.relationType)._2
+        val subRelType = subRelTypeMapping(r.relationType).relType
        rel.setRelType(REL_TYPE_VALUE)
        rel.setSubRelType(subRelType)
        rel.setRelClass(r.relationType)
@ -580,22 +489,13 @@ object DataciteToOAFTransformation {
        rel.setProperties(List(dateProps).asJava)
        rel.setSource(id)
-        rel.setTarget(s"unresolved::${r.relatedIdentifier}::${r.relatedIdentifierType}")
+        rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier,r.relatedIdentifierType))
        rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
-        rel.getCollectedfrom.asScala.map(c => c.getValue)(collection.breakOut)
+        rel.getCollectedfrom.asScala.map(c => c.getValue).toList
        rel
-      })(collection breakOut)
+      })
  }
  /**
   * Creates a DataInfo carrying the given trust and the
   * ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER provenance action, with the
   * deletedbyinference, inferred and invisible flags all set to false.
   *
   * @param trust the trust value to store
   * @return a new DataInfo instance
   */
  def generateDataInfo(trust: String): DataInfo = {
    val info = new DataInfo
    info.setTrust(trust)
    info.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
    // every flag is switched off
    info.setInferred(false)
    info.setInvisible(false)
    info.setDeletedbyinference(false)
    info
  }
  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala
@ -0,0 +1,94 @@
 package eu.dnetlib.dhp.datacite
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.AbstractScalaApplication
 import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
 import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
 import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 /**
  * Spark application transforming the native Datacite dataset into OAF objects and
  * storing the result under the HDFS path of an MDStore version.
  *
  * @param propertyPath path of the resource describing the accepted arguments
  * @param args         command line arguments
  * @param log          logger used by the application
  */
 class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
   extends AbstractScalaApplication(propertyPath, args, log) {

  /**
   * Entry point of the Spark node: reads the arguments, loads the vocabularies from the
   * lookup service, generates the OAF dataset and then records its size.
   */
  override def run(): Unit = {
    val inputPath = parser.get("sourcePath")
    log.info(s"SourcePath is '$inputPath'")

    val withLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
    log.info(s"exportLinks is '$withLinks'")

    val lookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", lookupUrl)

    val lookupService = ISLookupClientFactory.getLookUpService(lookupUrl)
    val vocabularyGroup = VocabularyGroup.loadVocsFromIS(lookupService)
    require(vocabularyGroup != null)

    // the output MDStore version is passed as a JSON serialisation of MDStoreVersion
    val outputVersionJson = parser.get("mdstoreOutputVersion")
    log.info(s"mdstoreOutputVersion is '$outputVersionJson'")
    val jsonMapper = new ObjectMapper()
    val outputVersion = jsonMapper.readValue(outputVersionJson, classOf[MDStoreVersion])

    val basePath = outputVersion.getHdfsPath
    log.info(s"outputBasePath is '$basePath'")

    val dataPath = s"$basePath/$MDSTORE_DATA_PATH"
    log.info(s"targetPath is '$dataPath'")

    generateDataciteDataset(inputPath, withLinks, vocabularyGroup, dataPath, spark)
    reportTotalSize(dataPath, basePath)
  }

  /**
   * Counts the items stored under targetPath and writes the number into a file on HDFS,
   * located at outputBasePath followed by MDSTORE_SIZE_PATH.
   * NOTE(review): here the constant is appended without a separator, whereas run() puts
   * a "/" before MDSTORE_DATA_PATH -- check whether the constants start with a slash.
   *
   * @param targetPath     path of the generated dataset
   * @param outputBasePath base HDFS path of the MDStore version
   */
  def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
    val sizeFilePath = outputBasePath + MDSTORE_SIZE_PATH
    val itemCount = spark.read.load(targetPath).count()
    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$itemCount", sizeFilePath)
  }

  /**
   * Generates the OAF dataset from the native Datacite one.
   *
   * Only the active records are transformed; the record timestamp is passed to the
   * transformation both as timestamp and as date of collection. Null results are dropped,
   * then every object goes through fixRelations before being saved (overwriting targetPath).
   *
   * @param sourcePath   path of the native dataset
   * @param exportLinks  flag forwarded to the transformation (link generation)
   * @param vocabularies vocabularies forwarded to the transformation
   * @param targetPath   path where the resulting dataset is saved
   * @param spark        the Spark session, must not be null
   */
  def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark: SparkSession): Unit = {
    require(spark!= null)
    import spark.implicits._

    implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]

    val activeRecords = spark.read.load(sourcePath).as[DataciteType].filter(record => record.isActive)
    val transformed = activeRecords
      .flatMap(record => DataciteToOAFTransformation.generateOAF(record.json, record.timestamp, record.timestamp, vocabularies, exportLinks))
      .filter(oaf => oaf != null)
    val fixed = transformed
      .flatMap(oaf => fixRelations(oaf))
      .filter(oaf => oaf != null)

    fixed.write.mode(SaveMode.Overwrite).save(targetPath)
  }
 }
 /** Launcher of the [[GenerateDataciteDatasetSpark]] application. */
 object GenerateDataciteDatasetSpark {

  val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)

  /**
   * Creates the application with the bundled parameter description, initializes it and runs it.
   *
   * @param args command line arguments
   */
  def main(args: Array[String]): Unit = {
    val application = new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log)
    application.initialize().run()
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala
@ -1,6 +1,5 @@
-package eu.dnetlib.dhp.actionmanager.datacite
+package eu.dnetlib.dhp.datacite
 import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
@ -9,14 +8,14 @@ import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.functions.max
 import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
 import org.apache.spark.sql.functions.max
 import org.slf4j.{Logger, LoggerFactory}
-import java.time.format.DateTimeFormatter._
+import java.time.format.DateTimeFormatter.ISO_DATE_TIME
-import java.time.{LocalDate, LocalDateTime, ZoneOffset}
+import java.time.{LocalDateTime, ZoneOffset}
 import scala.io.Source
 object ImportDatacite {
@ -138,11 +137,11 @@ object ImportDatacite {
    }
  }
-  private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
+  private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
-    var from:Long = timestamp * 1000
+    var from: Long = timestamp * 1000
-    val delta:Long = 100000000L
+    val delta: Long = 100000000L
    var client: DataciteAPIImporter = null
-    val now :Long =System.currentTimeMillis()
+    val now: Long = System.currentTimeMillis()
    var i = 0
    try {
      val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
@ -168,7 +167,7 @@ object ImportDatacite {
              start = System.currentTimeMillis
            }
          }
-          println(s"updating from value: $from  -> ${from+delta}")
+          println(s"updating from value: $from  -> ${from + delta}")
          from = from + delta
        }
      } catch {
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/SparkDownloadUpdateDatacite.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/SparkDownloadUpdateDatacite.scala
@ -1,18 +1,14 @@
-package eu.dnetlib.dhp.actionmanager.datacite
+package eu.dnetlib.dhp.datacite
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.LocalFileSystem
 import org.apache.hadoop.hdfs.DistributedFileSystem
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.apache.spark.sql.functions.max
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import java.text.SimpleDateFormat
-import java.util.{Date, Locale}
+import java.util.Locale
 import scala.io.Source
 object SparkDownloadUpdateDatacite {
@ -21,7 +17,7 @@ object SparkDownloadUpdateDatacite {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf
-    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
+    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
@ -42,9 +38,9 @@ object SparkDownloadUpdateDatacite {
    import spark.implicits._
-    val maxDate:String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
+    val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
-    val string_to_date =ISO8601FORMAT.parse(maxDate)
+    val string_to_date = ISO8601FORMAT.parse(maxDate)
    val ts = string_to_date.getTime
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/BioDBToOAF.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/BioDBToOAF.scala
@ -1,14 +1,12 @@
-package eu.dnetlib.dhp.sx.graph.bio
+package eu.dnetlib.dhp.sx.bio
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
-import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Instance, KeyValue, Oaf, Relation, StructuredProperty}
+import eu.dnetlib.dhp.schema.oaf._
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.{compact, parse, render}
-
+import collection.JavaConverters._
 import scala.collection.JavaConverters._
 object BioDBToOAF {
  case class EBILinkItem(id: Long, links: String) {}
@ -17,23 +15,23 @@ object BioDBToOAF {
  case class UniprotDate(date: String, date_info: String) {}
-  case class ScholixResolved(pid:String, pidType:String, typology:String, tilte:List[String], datasource:List[String], date:List[String], authors:List[String]){}
+  case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
  val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
  val SUBJ_CLASS = "Keywords"
  val DATE_RELATION_KEY = "RelationDate"
-  val resolvedURL:Map[String,String] = Map(
+  val resolvedURL: Map[String, String] = Map(
-    "genbank"->             "https://www.ncbi.nlm.nih.gov/nuccore/",
+    "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
    "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
-    "clinicaltrials.gov"->  "https://clinicaltrials.gov/ct2/show/",
+    "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
-    "onim"->                "https://omim.org/entry/",
+    "onim" -> "https://omim.org/entry/",
-    "refseq"->              "https://www.ncbi.nlm.nih.gov/nuccore/",
+    "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
-    "geo"->                 "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
+    "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
  )
@ -45,7 +43,7 @@ object BioDBToOAF {
    val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
    val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
    val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
-    val pubmedCollectedFrom:KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
+    val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
    UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
    PDBCollectedFrom.setDataInfo(DATA_INFO)
@ -58,9 +56,9 @@ object BioDBToOAF {
    Map(
      "uniprot" -> UNIPROTCollectedFrom,
-      "pdb"-> PDBCollectedFrom,
+      "pdb" -> PDBCollectedFrom,
-      "elsevier" ->ElsevierCollectedFrom,
+      "elsevier" -> ElsevierCollectedFrom,
-      "ebi" ->EBICollectedFrom,
+      "ebi" -> EBICollectedFrom,
      "Springer Nature" -> springerNatureCollectedFrom,
      "NCBI Nucleotide" -> ncbiCollectedFrom,
      "European Nucleotide Archive" -> enaCollectedFrom,
@ -68,7 +66,7 @@ object BioDBToOAF {
    )
  }
-  def crossrefLinksToOaf(input:String):Oaf = {
+  def crossrefLinksToOaf(input: String): Oaf = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
@ -77,16 +75,16 @@ object BioDBToOAF {
    val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
    val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
-    val relation_semantic= (json \ "RelationshipType" \ "Name").extract[String]
+    val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
    val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
-    createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type),collectedFromMap("elsevier"),"relationship", relation_semantic, date)
+    createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
  }
-  def scholixResolvedToOAF(input:ScholixResolved):Oaf = {
+  def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
    val d = new Dataset
@ -127,14 +125,14 @@ object BioDBToOAF {
    d.setInstance(List(i).asJava)
    if (input.authors != null && input.authors.nonEmpty) {
-      val authors = input.authors.map(a =>{
+      val authors = input.authors.map(a => {
        val authorOAF = new Author
        authorOAF.setFullname(a)
        authorOAF
      })
      d.setAuthor(authors.asJava)
    }
-    if (input.date!= null && input.date.nonEmpty) {
+    if (input.date != null && input.date.nonEmpty) {
      val dt = input.date.head
      i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
      d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
@ -190,7 +188,7 @@ object BioDBToOAF {
          OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
        ).asJava)
    }
-    var i_date:Option[UniprotDate] = None
+    var i_date: Option[UniprotDate] = None
    if (dates.nonEmpty) {
      i_date = dates.find(d => d.date_info.contains("entry version"))
@ -231,13 +229,12 @@ object BioDBToOAF {
  }
-
+  def generate_unresolved_id(pid: String, pidType: String): String = {
  def generate_unresolved_id(pid:String, pidType:String) :String = {
    s"unresolved::$pid::$pidType"
  }
-  def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType:String, relClass:String, date:String):Relation = {
+  def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
    val rel = new Relation
    rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
@ -251,7 +248,7 @@ object BioDBToOAF {
    rel.setTarget(s"unresolved::$pid::$pidType")
-    val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
+    val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
    rel.setProperties(List(dateProps).asJava)
@ -262,8 +259,8 @@ object BioDBToOAF {
  }
-  def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date:String): Relation = {
+  def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
-    createRelation(pid,pidType,sourceId,collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
+    createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
  }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/SparkTransformBioDatabaseToOAF.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/SparkTransformBioDatabaseToOAF.scala
@ -1,8 +1,9 @@
-package eu.dnetlib.dhp.sx.graph.bio
+package eu.dnetlib.dhp.sx.bio
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.schema.oaf.Oaf
 import BioDBToOAF.ScholixResolved
 import eu.dnetlib.dhp.collection.CollectionUtils
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
@ -13,7 +14,7 @@ object SparkTransformBioDatabaseToOAF {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val log: Logger = LoggerFactory.getLogger(getClass)
-    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/bio_to_oaf_params.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")))
    parser.parseArgument(args)
    val database: String = parser.get("database")
    log.info("database: {}", database)
@ -33,16 +34,15 @@ object SparkTransformBioDatabaseToOAF {
    implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
    import spark.implicits._
    database.toUpperCase() match {
      case "UNIPROT" =>
-        spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath)
+        spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
-      case "PDB"=>
+      case "PDB" =>
-        spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath)
+        spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
      case "SCHOLIX" =>
-        spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).write.mode(SaveMode.Overwrite).save(targetPath)
+        spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
-      case "CROSSREF_LINKS"=>
+      case "CROSSREF_LINKS" =>
-        spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath)
+        spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
    }
  }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkCreateBaselineDataFrame.scala
@ -1,9 +1,9 @@
-package eu.dnetlib.dhp.sx.graph.ebi
+package eu.dnetlib.dhp.sx.bio.ebi
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
 import eu.dnetlib.dhp.schema.oaf.Result
-import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
@ -24,24 +24,24 @@ import scala.xml.pull.XMLEventReader
 object SparkCreateBaselineDataFrame {
-  def requestBaseLineUpdatePage(maxFile:String):List[(String,String)] = {
+  def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
-    val data =requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
+    val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
-    val result =data.lines.filter(l => l.startsWith("<a href=")).map{l =>
+    val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
      val end = l.lastIndexOf("\">")
      val start = l.indexOf("<a href=\"")
-      if (start>= 0 && end >start)
+      if (start >= 0 && end > start)
-        l.substring(start+9, (end-start))
+        l.substring(start + 9, end - start)
      else
        ""
-    }.filter(s =>s.endsWith(".gz") ).filter(s => s > maxFile).map(s => (s,s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
+    }.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
    result
  }
-  def downloadBaselinePart(url:String):InputStream = {
+  def downloadBaselinePart(url: String): InputStream = {
    val r = new HttpGet(url)
    val timeout = 60; // seconds
    val config = RequestConfig.custom()
@ -55,7 +55,7 @@ object SparkCreateBaselineDataFrame {
  }
-  def requestPage(url:String):String = {
+  def requestPage(url: String): String = {
    val r = new HttpGet(url)
    val timeout = 60; // seconds
    val config = RequestConfig.custom()
@ -90,25 +90,21 @@ object SparkCreateBaselineDataFrame {
  }
-
+  def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
  def downloadBaseLineUpdate(baselinePath:String, hdfsServerUri:String ):Unit = {
    val conf = new Configuration
    conf.set("fs.defaultFS", hdfsServerUri)
    val fs = FileSystem.get(conf)
    val p = new Path(baselinePath)
-    val files = fs.listFiles(p,false)
+    val files = fs.listFiles(p, false)
    var max_file = ""
    while (files.hasNext) {
      val c = files.next()
      val data = c.getPath.toString
-      val fileName = data.substring(data.lastIndexOf("/")+1)
+      val fileName = data.substring(data.lastIndexOf("/") + 1)
-      if (fileName> max_file)
+      if (fileName > max_file)
        max_file = fileName
    }
@ -118,11 +114,7 @@ object SparkCreateBaselineDataFrame {
      val hdfsWritePath: Path = new Path(s"$baselinePath/${u._1}")
      val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
      val i = downloadBaselinePart(u._2)
-      val buffer = Array.fill[Byte](1024)(0)
+      IOUtils.copy(i, fsDataOutputStream)
      while(i.read(buffer)>0) {
        fsDataOutputStream.write(buffer)
      }
      i.close()
      println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
      fsDataOutputStream.close()
    }
@ -134,11 +126,11 @@ object SparkCreateBaselineDataFrame {
    override def zero: PMArticle = new PMArticle
    override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
-      if (b != null && b.getPmid!= null)   b  else a._2
+      if (b != null && b.getPmid != null) b else a._2
    }
    override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
-      if (b1 != null && b1.getPmid!= null)    b1   else     b2
+      if (b1 != null && b1.getPmid != null) b1 else b2
    }
@ -153,7 +145,7 @@ object SparkCreateBaselineDataFrame {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val log: Logger = LoggerFactory.getLogger(getClass)
-    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
    parser.parseArgument(args)
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)
@ -166,6 +158,9 @@ object SparkCreateBaselineDataFrame {
    val hdfsServerUri = parser.get("hdfsServerUri")
    log.info("hdfsServerUri: {}", targetPath)
    val skipUpdate = parser.get("skipUpdate")
    log.info("skipUpdate: {}", skipUpdate)
    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
@ -175,32 +170,31 @@ object SparkCreateBaselineDataFrame {
        .config(conf)
        .appName(SparkEBILinksToOaf.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    import spark.implicits._
    val sc = spark.sparkContext
    import spark.implicits._
    implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
    implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
    implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
    implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    if (!"true".equalsIgnoreCase(skipUpdate)) {
      downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
-
+      val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
-    val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
+      val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
    val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
        val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
        new PMParser(xml)
-
+      }))
-    } ))
+      ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
    ds.map(p => (p.getPmid,p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
        .agg(pmArticleAggregator.toColumn)
        .map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
    }
    val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
    exported_dataset
      .map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
-      .filter(p => p!= null)
+      .filter(p => p != null)
      .write.mode(SaveMode.Overwrite).save(targetPath)
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
@ -1,8 +1,9 @@
-package eu.dnetlib.dhp.sx.graph.ebi
+package eu.dnetlib.dhp.sx.bio.ebi
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
-import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
+import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
 import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal
 import org.apache.commons.io.IOUtils
 import org.apache.http.client.config.RequestConfig
 import org.apache.http.client.methods.HttpGet
@ -14,15 +15,15 @@ import org.slf4j.{Logger, LoggerFactory}
 object SparkDownloadEBILinks {
-  def createEBILinks(pmid:Long):EBILinkItem = {
+  def createEBILinks(pmid: Long): EBILinkItem = {
    val res = requestLinks(pmid)
-    if (res!=null)
+    if (res != null)
      return EBILinkItem(pmid, res)
    null
  }
-  def requestPage(url:String):String = {
+  def requestPage(url: String): String = {
    val r = new HttpGet(url)
    val timeout = 60; // seconds
    val config = RequestConfig.custom()
@ -56,16 +57,17 @@ object SparkDownloadEBILinks {
    }
  }
-  def requestLinks(PMID:Long):String = {
+  def requestLinks(PMID: Long): String = {
    requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
  }
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val MAX_ITEM_PER_PARTITION = 20000
    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
@ -85,27 +87,27 @@ object SparkDownloadEBILinks {
    val workingPath = parser.get("workingPath")
    log.info(s"workingPath  -> $workingPath")
-    log.info("Getting max pubmedId where the links have been requested")
+    log.info("Getting max pubmedId where the links have already requested")
-    val links:Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
+    val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
-    val lastPMIDRequested =links.map(l => l.id).select(max("value")).first.getLong(0)
+    val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
    log.info("Retrieving PMID to request links")
    val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
    pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
-    val pmidToReq:Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
+    val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
    val total = pmidToReq.count()
-    spark.createDataset(pmidToReq.rdd.repartition((total/MAX_ITEM_PER_PARTITION).toInt).map(pmid =>createEBILinks(pmid)).filter(l => l!= null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
+    spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
-    val updates:Dataset[EBILinkItem] =spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
+    val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
    links.union(updates).groupByKey(_.id)
-      .reduceGroups{(x,y) =>
+      .reduceGroups { (x, y) =>
-        if (x == null || x.links ==null)
+        if (x == null || x.links == null)
          y
-        if (y ==null || y.links ==null)
+        if (y == null || y.links == null)
          x
        if (x.links.length > y.links.length)
          x
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala
@ -1,21 +1,22 @@
-package eu.dnetlib.dhp.sx.graph.ebi
+package eu.dnetlib.dhp.sx.bio.ebi
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.Oaf
-import eu.dnetlib.dhp.sx.graph.bio
+import eu.dnetlib.dhp.sx.bio.BioDBToOAF
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
+import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
+import BioDBToOAF.EBILinkItem
 import eu.dnetlib.dhp.collection.CollectionUtils
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
-import org.apache.spark.rdd.RDD
+import org.apache.spark.sql._
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 object SparkEBILinksToOaf {
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
@ -24,19 +25,20 @@ object SparkEBILinksToOaf {
        .appName(SparkEBILinksToOaf.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    import spark.implicits._
    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath  -> $sourcePath")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath  -> $targetPath")
    import spark.implicits._
    implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
-    val ebLinks:Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links!= null)
+    val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
-    ebLinks.flatMap(j =>BioDBToOAF.parse_ebi_links(j.links))
+    ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
      .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
      .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
      .flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null)
      .write.mode(SaveMode.Overwrite).save(targetPath)
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
@ -0,0 +1,253 @@
 package eu.dnetlib.dhp.sx.bio.pubmed;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 /**
 * Plain data holder for a single PubMed article extracted from the native XML.
 * All state is exposed through simple accessors; no validation is performed here.
 *
 * @author Sandro La Bruzzo
 */
 public class PMArticle implements Serializable {
 	/**
 	 * The PubMed identifier (PMID).
 	 */
 	private String pmid;
 	/**
 	 * The DOI.
 	 */
 	private String doi;
 	/**
 	 * The PubMed date extracted from {@code <PubmedPubDate>}: a date significant to either the article's
 	 * history or the citation's processing. All {@code <History>} dates have {@code <Year>}, {@code <Month>}
 	 * and {@code <Day>} elements; some also have {@code <Hour>}, {@code <Minute>} and {@code <Second>}.
 	 */
 	private String date;
 	/**
 	 * The journal 'envelope': it groups the elements describing the cited journal (ISSN, Volume, Issue,
 	 * PubDate) and does not carry data by itself.
 	 */
 	private PMJournal journal;
 	/**
 	 * The title.
 	 * NOTE(review): the previous comment described the full journal title (compiled following NLM
 	 * cataloging rules); since PMJournal carries its own title, this is presumably the article
 	 * title - confirm against the parser.
 	 */
 	private String title;
 	/**
 	 * The abstract. English-language abstracts are taken directly from the published article; when the
 	 * article has no published abstract the record lacks {@code <Abstract>} and {@code <AbstractText>}.
 	 * In the absence of a formally labeled abstract, text from a substantive "summary",
 	 * "summary and conclusions" or "conclusions and summary" may be used.
 	 */
 	private String description;
 	/**
 	 * The language the article was published in, as recorded in {@code <Language>}: a three letter,
 	 * lower case abbreviation (eng, fre, ger, jpn, ...). The value {@code und} marks an undetermined language.
 	 */
 	private String language;
 	/**
 	 * Subjects taken from the NLM controlled vocabulary, Medical Subject Headings (MeSH),
 	 * which characterizes the content of the article.
 	 */
 	private final List<PMSubject> subjects = new ArrayList<>();
 	/**
 	 * The type(s) of article indexed for MEDLINE: the nature of the information, the manner in which it
 	 * is conveyed and the type of research support received (e.g. Review, Letter, Retracted Publication,
 	 * Clinical Conference, Research Support, N.I.H., Extramural).
 	 */
 	private final List<PMSubject> publicationTypes = new ArrayList<>();
 	/**
 	 * Personal and collective (corporate) author names, found in {@code <AuthorList>}.
 	 */
 	private List<PMAuthor> authors = new ArrayList<>();
 	/**
 	 * Grants, from {@code <GrantID>}: the research grant or contract number (or both) designating
 	 * financial support by a funding agency or organization.
 	 */
 	private final List<PMGrant> grants = new ArrayList<>();
 	/**
 	 * Gets the PubMed identifier.
 	 *
 	 * @return the PMID
 	 */
 	public String getPmid() {
 		return this.pmid;
 	}
 	/**
 	 * Sets the PubMed identifier.
 	 *
 	 * @param pmid the PubMed identifier
 	 */
 	public void setPmid(String pmid) {
 		this.pmid = pmid;
 	}
 	/**
 	 * Gets the DOI.
 	 *
 	 * @return the DOI
 	 */
 	public String getDoi() {
 		return this.doi;
 	}
 	/**
 	 * Sets the DOI.
 	 *
 	 * @param doi the DOI
 	 */
 	public void setDoi(String doi) {
 		this.doi = doi;
 	}
 	/**
 	 * Gets the PubMed date (see the {@code date} field for its origin).
 	 *
 	 * @return the PubMed date
 	 */
 	public String getDate() {
 		return this.date;
 	}
 	/**
 	 * Sets the PubMed date.
 	 *
 	 * @param date the PubMed date
 	 */
 	public void setDate(String date) {
 		this.date = date;
 	}
 	/**
 	 * Gets the journal information mapped for this article.
 	 *
 	 * @return the PubMed journal
 	 */
 	public PMJournal getJournal() {
 		return this.journal;
 	}
 	/**
 	 * Sets the journal information mapped for this article.
 	 *
 	 * @param journal the PubMed journal
 	 */
 	public void setJournal(PMJournal journal) {
 		this.journal = journal;
 	}
 	/**
 	 * Gets the title.
 	 *
 	 * @return the title
 	 */
 	public String getTitle() {
 		return this.title;
 	}
 	/**
 	 * Sets the title.
 	 *
 	 * @param title the title
 	 */
 	public void setTitle(String title) {
 		this.title = title;
 	}
 	/**
 	 * Gets the abstract of the article.
 	 *
 	 * @return the mapped abstract
 	 */
 	public String getDescription() {
 		return this.description;
 	}
 	/**
 	 * Sets the abstract of the article.
 	 *
 	 * @param description the mapped abstract
 	 */
 	public void setDescription(String description) {
 		this.description = description;
 	}
 	/**
 	 * Gets the language the article was published in.
 	 *
 	 * @return the mapped language
 	 */
 	public String getLanguage() {
 		return this.language;
 	}
 	/**
 	 * Sets the language the article was published in.
 	 *
 	 * @param language the mapped language
 	 */
 	public void setLanguage(String language) {
 		this.language = language;
 	}
 	/**
 	 * Gets the MeSH subjects. There is no setter: the returned list is the live internal list.
 	 *
 	 * @return the mapped subjects
 	 */
 	public List<PMSubject> getSubjects() {
 		return this.subjects;
 	}
 	/**
 	 * Gets the publication types. There is no setter: the returned list is the live internal list.
 	 *
 	 * @return the mapped publication types
 	 */
 	public List<PMSubject> getPublicationTypes() {
 		return this.publicationTypes;
 	}
 	/**
 	 * Gets the authors.
 	 *
 	 * @return the mapped authors list
 	 */
 	public List<PMAuthor> getAuthors() {
 		return this.authors;
 	}
 	/**
 	 * Replaces the authors list with the given one (the reference is stored as is, not copied).
 	 *
 	 * @param authors the mapped authors list
 	 */
 	public void setAuthors(List<PMAuthor> authors) {
 		this.authors = authors;
 	}
 	/**
 	 * Gets the grants. There is no setter: the returned list is the live internal list.
 	 *
 	 * @return the mapped grants
 	 */
 	public List<PMGrant> getGrants() {
 		return this.grants;
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMAuthor.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMAuthor.java
@ -1,29 +1,59 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetlib.dhp.sx.bio.pubmed;
 import java.io.Serializable;
 /**
 * The type Pubmed author.
 *
 * @author Sandro La Bruzzo
 */
 public class PMAuthor implements Serializable {
 	private String lastName;
 	private String foreName;
 	/**
 	 * Gets last name.
 	 *
 	 * @return the last name
 	 */
 	public String getLastName() {
 		return lastName;
 	}
 	/**
 	 * Sets last name.
 	 *
 	 * @param lastName the last name
 	 */
 	public void setLastName(String lastName) {
 		this.lastName = lastName;
 	}
 	/**
 	 * Gets fore name.
 	 *
 	 * @return the fore name
 	 */
 	public String getForeName() {
 		return foreName;
 	}
 	/**
 	 * Sets fore name.
 	 *
 	 * @param foreName the fore name
 	 */
 	public void setForeName(String foreName) {
 		this.foreName = foreName;
 	}
 	/**
 	 * Gets full name.
 	 *
 	 * @return the full name
 	 */
 	public String getFullName() {
 		return String
 			.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java
@ -0,0 +1,87 @@
 package eu.dnetlib.dhp.sx.bio.pubmed;
 /**
 * A grant attached to a PubMed article: the grant identifier, the funding agency and its country.
 * <p>
 * The class is {@link java.io.Serializable} because PMArticle is Serializable and holds a list of
 * PMGrant: with a non-serializable element, Java serialization of an article carrying grants
 * fails with a NotSerializableException.
 *
 * @author Sandro La Bruzzo
 */
 public class PMGrant implements java.io.Serializable {
 	private String grantID;
 	private String agency;
 	private String country;
 	/**
 	 * Instantiates an empty grant; every field is left {@code null}.
 	 */
 	public PMGrant() {
 	}
 	/**
 	 * Instantiates a grant with all of its fields.
 	 *
 	 * @param grantID the grant id
 	 * @param agency  the agency
 	 * @param country the country
 	 */
 	public PMGrant(String grantID, String agency, String country) {
 		this.grantID = grantID;
 		this.agency = agency;
 		this.country = country;
 	}
 	/**
 	 * Gets grant id.
 	 *
 	 * @return the grant id
 	 */
 	public String getGrantID() {
 		return grantID;
 	}
 	/**
 	 * Sets grant id.
 	 *
 	 * @param grantID the grant id
 	 */
 	public void setGrantID(String grantID) {
 		this.grantID = grantID;
 	}
 	/**
 	 * Gets agency.
 	 *
 	 * @return the agency
 	 */
 	public String getAgency() {
 		return agency;
 	}
 	/**
 	 * Sets agency.
 	 *
 	 * @param agency the agency
 	 */
 	public void setAgency(String agency) {
 		this.agency = agency;
 	}
 	/**
 	 * Gets country.
 	 *
 	 * @return the country
 	 */
 	public String getCountry() {
 		return country;
 	}
 	/**
 	 * Sets country.
 	 *
 	 * @param country the country
 	 */
 	public void setCountry(String country) {
 		this.country = country;
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMJournal.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMJournal.java
@ -1,8 +1,13 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetlib.dhp.sx.bio.pubmed;
 import java.io.Serializable;
 /**
 * The type Pm journal.
 *
 * @author  Sandro La Bruzzo
 */
 public class PMJournal implements Serializable {
 	private String issn;
@ -11,42 +16,92 @@ public class PMJournal implements Serializable {
 	private String date;
 	private String title;
 	/**
 	 * Gets issn.
 	 *
 	 * @return the issn
 	 */
 	public String getIssn() {
 		return issn;
 	}
 	/**
 	 * Sets issn.
 	 *
 	 * @param issn the issn
 	 */
 	public void setIssn(String issn) {
 		this.issn = issn;
 	}
 	/**
 	 * Gets volume.
 	 *
 	 * @return the volume
 	 */
 	public String getVolume() {
 		return volume;
 	}
 	/**
 	 * Sets volume.
 	 *
 	 * @param volume the volume
 	 */
 	public void setVolume(String volume) {
 		this.volume = volume;
 	}
 	/**
 	 * Gets issue.
 	 *
 	 * @return the issue
 	 */
 	public String getIssue() {
 		return issue;
 	}
 	/**
 	 * Sets issue.
 	 *
 	 * @param issue the issue
 	 */
 	public void setIssue(String issue) {
 		this.issue = issue;
 	}
 	/**
 	 * Gets date.
 	 *
 	 * @return the date
 	 */
 	public String getDate() {
 		return date;
 	}
 	/**
 	 * Sets date.
 	 *
 	 * @param date the date
 	 */
 	public void setDate(String date) {
 		this.date = date;
 	}
 	/**
 	 * Gets title.
 	 *
 	 * @return the title
 	 */
 	public String getTitle() {
 		return title;
 	}
 	/**
 	 * Sets title.
 	 *
 	 * @param title the title
 	 */
 	public void setTitle(String title) {
 		this.title = title;
 	}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMParser.scala
@ -1,7 +1,13 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed
+package eu.dnetlib.dhp.sx.bio.pubmed
 import scala.xml.MetaData
 import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
 /**
 *
 * @param xml
 */
 class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
  var currentArticle:PMArticle = generateNextArticle()
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMSubject.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMSubject.java
@ -1,40 +1,83 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetlib.dhp.sx.bio.pubmed;
 /**
 * The type Pubmed subject.
 */
 public class PMSubject {
 	private String value;
 	private String meshId;
 	private String registryNumber;
 	/**
 	 * Instantiates a new Pm subject.
 	 */
 	public PMSubject() {
 	}
 	/**
 	 * Instantiates a new Pm subject.
 	 *
 	 * @param value          the value
 	 * @param meshId         the mesh id
 	 * @param registryNumber the registry number
 	 */
 	public PMSubject(String value, String meshId, String registryNumber) {
 		this.value = value;
 		this.meshId = meshId;
 		this.registryNumber = registryNumber;
 	}
 	/**
 	 * Gets value.
 	 *
 	 * @return the value
 	 */
 	public String getValue() {
 		return value;
 	}
 	/**
 	 * Sets value.
 	 *
 	 * @param value the value
 	 */
 	public void setValue(String value) {
 		this.value = value;
 	}
 	/**
 	 * Gets mesh id.
 	 *
 	 * @return the mesh id
 	 */
 	public String getMeshId() {
 		return meshId;
 	}
 	/**
 	 * Sets mesh id.
 	 *
 	 * @param meshId the mesh id
 	 */
 	public void setMeshId(String meshId) {
 		this.meshId = meshId;
 	}
 	/**
 	 * Gets registry number.
 	 *
 	 * @return the registry number
 	 */
 	public String getRegistryNumber() {
 		return registryNumber;
 	}
 	/**
 	 * Sets registry number.
 	 *
 	 * @param registryNumber the registry number
 	 */
 	public void setRegistryNumber(String registryNumber) {
 		this.registryNumber = registryNumber;
 	}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala
@ -1,12 +1,16 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed
+package eu.dnetlib.dhp.sx.bio.pubmed
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf._
 import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
-
+import eu.dnetlib.dhp.schema.oaf._
 import java.util.regex.Pattern
 import scala.collection.JavaConverters._
 import java.util.regex.Pattern
 /**
 *
 */
 object PubMedToOaf {
  val SUBJ_CLASS = "keywords"
@ -14,8 +18,18 @@ object PubMedToOaf {
    "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
    "doi" -> "https://dx.doi.org/"
  )
  val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
  val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
-  def cleanDoi(doi:String):String = {
+
  /**
   * Cleans the DOI by applying a regex, in order to
   * remove DOIs starting with a URL
   * @param doi input DOI
   * @return cleaned DOI
   */
  def cleanDoi(doi: String): String = {
    val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
@ -29,6 +43,15 @@ object PubMedToOaf {
    null
  }
  /**
   *
   * Creates an instance of a class that extends Result,
   * starting from the OAF instanceType value
   *
   * @param cobjQualifier OAF instance type
   * @param vocabularies All dnet vocabularies
   * @return the correct instance
   */
  def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
    val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
    result_typologies.getClassid match {
@ -41,6 +64,12 @@ object PubMedToOaf {
    }
  }
  /**
   *  Maps the PubMed journal info into the OAF Journal
   *
   * @param j the pubmedJournal
   * @return the OAF Journal
   */
  def mapJournal(j: PMJournal): Journal = {
    if (j == null)
      return null
@ -48,6 +77,7 @@ object PubMedToOaf {
    journal.setDataInfo(dataInfo)
    journal.setName(j.getTitle)
    journal.setConferencedate(j.getDate)
    journal.setVol(j.getVolume)
    journal.setIssnPrinted(j.getIssn)
    journal.setIss(j.getIssue)
@ -56,69 +86,108 @@ object PubMedToOaf {
  }
-
+  /**
   *
   * Looks up the term in the given vocabulary: first among the synonyms, then among the terms
   *
   * @param vocabularyName the input vocabulary name
   * @param vocabularies all the vocabularies
   * @param term the term to find
   *
   * @return the cleaned term value
   */
  def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
    // Both lookups are always executed, synonym lookup first.
    val synonymMatch = vocabularies.getSynonymAsQualifier(vocabularyName, term)
    val termMatch = vocabularies.getTermAsQualifier(vocabularyName, term)
    // A synonym match takes precedence; otherwise fall back to the direct
    // term match, which may itself be null when nothing was found.
    Option(synonymMatch).getOrElse(termMatch)
  }
  val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
  val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
  /**
   *  Map the Pubmed Article into the OAF instance
   *
   *
   * @param article the pubmed articles
   * @param vocabularies the vocabularies
   * @return The OAF instance if the mapping did not fail
   */
  def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = {
    if (article.getPublicationTypes == null)
      return null
-    val i = new Instance
+
-    var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
+
    // MAP PMID into  pid with  classid = classname = pmid
    val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
    if (pidList == null)
      return null
-    var alternateIdentifier :StructuredProperty = null
+    // MAP //ArticleId[./@IdType="doi"]   into  alternateIdentifier with classid = classname = doi
    var alternateIdentifier: StructuredProperty = null
    if (article.getDoi != null) {
      val normalizedPid = cleanDoi(article.getDoi)
-      if (normalizedPid!= null)
+      if (normalizedPid != null)
        alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
    }
    // INSTANCE MAPPING
    //--------------------------------------------------------------------------------------
    // If the article contains the typology Journal Article then we apply this type
    //else We have to find a terms that match the vocabulary otherwise we discard it
    val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
    val pubmedInstance = new Instance
    if (ja.isDefined) {
      val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
-      i.setInstancetype(cojbCategory)
+      pubmedInstance.setInstancetype(cojbCategory)
    } else {
      val i_type = article.getPublicationTypes.asScala
        .map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
        .find(q => q != null)
      if (i_type.isDefined)
-        i.setInstancetype(i_type.get)
+        pubmedInstance.setInstancetype(i_type.get)
      else
        return null
    }
-    val result = createResult(i.getInstancetype, vocabularies)
+    val result = createResult(pubmedInstance.getInstancetype, vocabularies)
    if (result == null)
      return result
    result.setDataInfo(dataInfo)
-    i.setPid(pidList.asJava)
+    pubmedInstance.setPid(pidList.asJava)
-    if (alternateIdentifier!= null)
+    if (alternateIdentifier != null)
-      i.setAlternateIdentifier(List(alternateIdentifier).asJava)
+      pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
-    result.setInstance(List(i).asJava)
+    result.setInstance(List(pubmedInstance).asJava)
-    i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut)
+    pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
    //CREATE URL From pmid
    val urlLists: List[String] = pidList
      .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
      .filter(t => t._1.nonEmpty)
      .map(t => t._1 + t._2)
    if (urlLists != null)
-      i.setUrl(urlLists.asJava)
+      pubmedInstance.setUrl(urlLists.asJava)
-    i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
+
-    i.setCollectedfrom(collectedFrom)
+    //ASSIGN DateofAcceptance
    pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
    //ASSIGN COLLECTEDFROM
    pubmedInstance.setCollectedfrom(collectedFrom)
    result.setPid(pidList.asJava)
    //END INSTANCE MAPPING
    //--------------------------------------------------------------------------------------
    // JOURNAL MAPPING
    //--------------------------------------------------------------------------------------
    if (article.getJournal != null && result.isInstanceOf[Publication])
      result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal))
    result.setCollectedfrom(List(collectedFrom).asJava)
    //END JOURNAL MAPPING
    //--------------------------------------------------------------------------------------
    // RESULT MAPPING
    //--------------------------------------------------------------------------------------
    result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
    if (article.getTitle == null || article.getTitle.isEmpty)
@ -136,7 +205,7 @@ object PubMedToOaf {
    }
-    val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection breakOut)
+    val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
    if (subjects != null)
      result.setSubject(subjects.asJava)
@ -148,7 +217,7 @@ object PubMedToOaf {
      author.setFullname(a.getFullName)
      author.setRank(index + 1)
      author
-    }(collection breakOut)
+    }(collection.breakOut)
    if (authors != null && authors.nonEmpty)
@ -158,6 +227,9 @@ object PubMedToOaf {
    result.setId(article.getPmid)
    // END RESULT MAPPING
    //--------------------------------------------------------------------------------------
    val id = IdentifierFactory.createIdentifier(result)
    if (article.getPmid.equalsIgnoreCase(id))
      return null
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json
@ -0,0 +1,33 @@
 [
  {
    "paramName":"s",
    "paramLongName":"sourcePath",
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName":"out",
    "paramLongName":"outputPath",
    "paramDescription": "the output path",
    "paramRequired": true
  },
  {
    "paramName": "ssm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  },
  {
    "paramName": "hnn",
    "paramLongName": "hdfsNameNode",
    "paramDescription": "the hdfs name node",
    "paramRequired": true
  },
  {
    "paramName": "cfn",
    "paramLongName": "classForName",
    "paramDescription": "the fully qualified name of the class modelling the input records",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml
@ -0,0 +1,30 @@
 <configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
    <property>
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml
@ -0,0 +1,174 @@
 <workflow-app name="UnresolvedEntities" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>fosPath</name>
            <description>the input path of the resources to be extended</description>
        </property>
        <property>
            <name>bipScorePath</name>
            <description>the path where to find the bipFinder scores</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="prepareInfo"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <fork name="prepareInfo">
        <path start="prepareBip"/>
        <path start="getFOS"/>
    </fork>
    <action name="prepareBip">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the unresolved from bip finder!</name>
            <class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${bipScorePath}</arg>
            <arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
        </spark>
        <ok to="join"/>
        <error to="Kill"/>
    </action>
    <action name="getFOS">
        <java>
            <main-class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSData</main-class>
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--sourcePath</arg><arg>${fosPath}</arg>
            <arg>--outputPath</arg><arg>${workingDir}/input/fos</arg>
            <arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel</arg>
        </java>
        <ok to="prepareFos"/>
        <error to="Kill"/>
    </action>
    <action name="prepareFos">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the unresolved from FOS!</name>
            <class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/input/fos</arg>
            <arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
        </spark>
        <ok to="join"/>
        <error to="Kill"/>
    </action>
    <join name="join" to="produceUnresolved"/>
    <action name="produceUnresolved">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Saves the result produced for bip and fos by grouping results with the same id</name>
            <class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.SparkSaveUnresolved</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/prepared</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json
@ -0,0 +1,20 @@
 [
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "sp",
    "paramLongName": "sourcePath",
    "paramDescription": "the URL from where to get the programme file",
    "paramRequired": true
  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",
    "paramDescription": "the path of the new ActionSet",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json
@ -0,0 +1,20 @@
 [
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "sp",
    "paramLongName": "sourcePath",
    "paramDescription": "the URL from where to get the programme file",
    "paramRequired": true
  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",
    "paramDescription": "the path of the new ActionSet",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml
@ -1,95 +0,0 @@
 <workflow-app name="Import_Datacite_and_transform_to_OAF" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>mainPath</name>
            <description>the working path of Datacite stores</description>
        </property>
        <property>
            <name>oafTargetPath</name>
            <description>the target path where the OAF records are stored</description>
        </property>
        <property>
            <name>isLookupUrl</name>
            <description>The IS lookUp service endopoint</description>
        </property>
        <property>
            <name>blocksize</name>
            <value>100</value>
            <description>The request block size</description>
        </property>
        <property>
            <name>exportLinks</name>
            <value>false</value>
            <description>instructs the transformation phase to produce the links or not</description>
        </property>
    </parameters>
    <start to="resume_from"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <decision name="resume_from">
        <switch>
            <case to="TransformDatacite">${wf:conf('resumeFrom') eq 'TransformDatacite'}</case>
            <default to="ImportDatacite"/>
        </switch>
    </decision>
    <action name="ImportDatacite">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>ImportDatacite</name>
            <class>eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
            <arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
            <arg>--namenode</arg><arg>${nameNode}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
            <arg>--blocksize</arg><arg>${blocksize}</arg>
        </spark>
        <ok to="TransformDatacite"/>
        <error to="Kill"/>
    </action>
    <action name="TransformDatacite">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>TransformJob</name>
            <class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
            <arg>--targetPath</arg><arg>${oafTargetPath}</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--exportLinks</arg><arg>${exportLinks}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml
@ -1,84 +0,0 @@
 <workflow-app name="Generate_Datacite_and_Crossref_dump_for_Scholexplorer" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>datacitePath</name>
            <description>the path of Datacite spark dataset</description>
        </property>
        <property>
            <name>isLookupUrl</name>
            <description>The IS lookUp service endopoint</description>
        </property>
        <property>
            <name>crossrefPath</name>
            <description>the path of Crossref spark dataset</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the path of Crossref spark dataset</description>
        </property>
    </parameters>
    <start to="ImportDatacite"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="ImportDatacite">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>ImportDatacite</name>
            <class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${datacitePath}</arg>
            <arg>--targetPath</arg><arg>${targetPath}/datacite_oaf</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--exportLinks</arg><arg>true</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="FilterCrossrefEntities"/>
        <error to="Kill"/>
    </action>
    <action name="FilterCrossrefEntities">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>FilterCrossrefEntities</name>
            <class>eu.dnetlib.dhp.actionmanager.datacite.FilterCrossrefEntitiesSpark</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${crossrefPath}</arg>
            <arg>--targetPath</arg><arg>${targetPath}/crossref_oaf</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json
@ -0,0 +1,25 @@
 [
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the zipped opencitations file",
    "paramRequired": true
  },
  {
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the working path",
    "paramRequired": true
  },
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  },  {
  "paramName": "sdr",
  "paramLongName": "shouldDuplicateRels",
  "paramDescription": "true if the relations have to be duplicated, false otherwise",
  "paramRequired": false
 }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json
@ -0,0 +1,20 @@
 [
  {
    "paramName": "if",
    "paramLongName": "inputFile",
    "paramDescription": "the zipped opencitations file",
    "paramRequired": true
  },
  {
    "paramName": "wp",
    "paramLongName": "workingPath",
    "paramDescription": "the working path",
    "paramRequired": true
  },
  {
    "paramName": "hnn",
    "paramLongName": "hdfsNameNode",
    "paramDescription": "the hdfs name node",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml
@ -1,7 +1,4 @@
 <configuration>
    <!-- OCEAN  -->
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
@ -10,41 +7,6 @@
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <!-- GARR  -->
 <!--    <property>-->
 <!--        <name>jobTracker</name>-->
 <!--        <value>yarn</value>-->
 <!--    </property>-->
 <!--    <property>-->
 <!--        <name>nameNode</name>-->
 <!--        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
 <!--    </property>-->
 <!--    <property>-->
 <!--        <name>hive_metastore_uris</name>-->
 <!--        <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
 <!--    </property>-->
 <!--    <property>-->
 <!--        <name>spark2YarnHistoryServerAddress</name>-->
 <!--        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
 <!--    </property>-->
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
@ -53,16 +15,44 @@
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
    <property>
        <name>sparkExecutorNumber</name>
        <value>4</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
-        <name>spark2ExtraListeners</name>
+        <name>sparkDriverMemory</name>
-        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
+        <value>15G</value>
    </property>
    <property>
-        <name>spark2SqlQueryExecutionListeners</name>
+        <name>sparkExecutorMemory</name>
-        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
+        <value>6G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
        <value>1</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh
@ -0,0 +1,2 @@
 #!/bin/bash
 # Usage: download.sh "<url>@<target>;<url>@<target>;..." <hdfs_dir>
 #
 # For every ';'-separated entry of the first argument, downloads <url> and
 # streams it to <hdfs_dir>/<target> on HDFS. Exits non-zero if any entry fails.

 # A failing curl must fail the whole pipeline, not only a failing hdfs put.
 set -o pipefail
 # URLs may legitimately contain '?' or '*': never glob-expand the entries.
 set -f

 status=0
 for file in $(echo "$1" | tr ";" "\n"); do
   url=$(echo "$file" | cut -d '@' -f 1)
   target=$(echo "$file" | cut -d '@' -f 2)
   # -f: exit non-zero on HTTP errors instead of piping the error page into HDFS.
   curl -fL "$url" | hdfs dfs -put - "$2/$target" || status=1
 done
 exit $status
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml
@ -0,0 +1,91 @@
 <workflow-app name="OpenCitations Integration" xmlns="uri:oozie:workflow:0.5">
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="resume_from"/>
    <decision name="resume_from">
        <switch>
            <case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
            <case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
            <default to="create_actionset"/> <!-- default: skip download and extraction and go straight to the creation of the action set -->
        </switch>
    </decision>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="download">
        <shell xmlns="uri:oozie:shell-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <exec>download.sh</exec>
            <argument>${filelist}</argument>
            <argument>${workingPath}/Original</argument>
            <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
            <file>download.sh</file>
            <capture-output/>
        </shell>
        <ok to="extract"/>
        <error to="Kill"/>
    </action>
    <action name="extract">
        <java>
            <main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--inputFile</arg><arg>${inputFile}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
        </java>
        <ok to="create_actionset"/>
        <error to="Kill"/>
    </action>
    <action name="create_actionset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the AS for OC</name>
            <class>eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json
@ -0,0 +1,8 @@
 [
  {"paramName":"n",   "paramLongName":"hdfsServerUri",	"paramDescription": "the server uri",   "paramRequired": true},
  {"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the default work path",	"paramRequired": true},
  {"paramName":"f",   "paramLongName":"opencitationFile",	"paramDescription": "the name of the file",	"paramRequired": true},
  {"paramName":"issm",   "paramLongName":"isSparkSessionManaged",	"paramDescription": "true if the spark session is managed, false otherwise",	"paramRequired": false},
  {"paramName":"o",   "paramLongName":"outputPath",	"paramDescription": "the path of the output",	"paramRequired": true}
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml
@ -1,46 +1,52 @@
-<workflow-app name="Datacite_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Collect_Datacite" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
-            <name>sourcePath</name>
+            <name>mainPath</name>
            <description>the working path of Datacite stores</description>
        </property>
        <property>
-            <name>outputPath</name>
+            <name>isLookupUrl</name>
-            <description>the path of Datacite ActionSet</description>
+            <description>The IS lookUp service endpoint</description>
        </property>
        <property>
            <name>blocksize</name>
            <value>100</value>
            <description>The request block size</description>
        </property>
    </parameters>
-    <start to="ExportDataset"/>
+    <start to="ImportDatacite"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
-    <action name="ExportDataset">
+    <action name="ImportDatacite">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
-            <name>ExportDataset</name>
+            <name>ImportDatacite</name>
-            <class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
+            <class>eu.dnetlib.dhp.datacite.ImportDatacite</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
-            <arg>--targetPath</arg><arg>${outputPath}</arg>
+            <arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
            <arg>--namenode</arg><arg>${nameNode}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
            <arg>--blocksize</arg><arg>${blocksize}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json
@ -7,8 +7,8 @@
  },
  {
-    "paramName": "t",
+    "paramName": "mo",
-    "paramLongName": "targetPath",
+    "paramLongName": "mdstoreOutputVersion",
    "paramDescription": "the target mdstore path",
    "paramRequired": true
  },
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml
@ -0,0 +1,126 @@
 <workflow-app name="transform_Datacite" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>mainPath</name>
            <description>the working path of Datacite stores</description>
        </property>
        <property>
            <name>isLookupUrl</name>
            <description>The IS lookUp service endpoint</description>
        </property>
        <property>
            <name>mdStoreOutputId</name>
            <description>the identifier of the cleaned MDStore</description>
        </property>
        <property>
            <name>mdStoreManagerURI</name>
            <description>the URI of the MDStore Manager service</description>
        </property>
    </parameters>
    <start to="StartTransaction"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="StartTransaction">
        <java>
            <configuration>
                <property>
                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
                    <value>true</value>
                </property>
            </configuration>
            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
            <arg>--action</arg><arg>NEW_VERSION</arg>
            <arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
            <capture-output/>
        </java>
        <ok to="TransformJob"/>
        <error to="EndReadRollBack"/>
    </action>
    <action name="TransformJob">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>TransformJob</name>
            <class>eu.dnetlib.dhp.datacite.GenerateDataciteDatasetSpark</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
            <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--exportLinks</arg><arg>true</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="CommitVersion"/>
        <error to="Kill"/>
    </action>
    <action name="CommitVersion">
        <java>
            <configuration>
                <property>
                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
                    <value>true</value>
                </property>
            </configuration>
            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
            <arg>--action</arg><arg>COMMIT</arg>
            <arg>--namenode</arg><arg>${nameNode}</arg>
            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
        </java>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <action name="EndReadRollBack">
        <java>
            <configuration>
                <property>
                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
                    <value>true</value>
                </property>
            </configuration>
            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
            <arg>--action</arg><arg>READ_UNLOCK</arg>
            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
            <arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
            <capture-output/>
        </java>
        <ok to="RollBack"/>
        <error to="Kill"/>
    </action>
    <action name="RollBack">
        <java>
            <configuration>
                <property>
                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
                    <value>true</value>
                </property>
            </configuration>
            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
            <arg>--action</arg><arg>ROLLBACK</arg>
            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
        </java>
        <ok to="Kill"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/bio/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/bio/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/workflow.xml
@ -0,0 +1,51 @@
    <workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the PDB Database Working Path</description>
        </property>
        <property>
            <name>database</name>
            <description>the bio database to convert</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the Target Working dir path</description>
        </property>
    </parameters>
    <start to="ConvertDB"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="ConvertDB">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Convert Bio DB to OAF Dataset</name>
            <class>eu.dnetlib.dhp.sx.bio.SparkTransformBioDatabaseToOAF</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=2000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--dbPath</arg><arg>${sourcePath}</arg>
            <arg>--database</arg><arg>${database}</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json
@ -3,5 +3,6 @@
  {"paramName":"i",   "paramLongName":"isLookupUrl",    "paramDescription": "isLookupUrl",                              "paramRequired": true},
  {"paramName":"w",   "paramLongName":"workingPath",    "paramDescription": "the path of the sequential file to read",  "paramRequired": true},
  {"paramName":"t",   "paramLongName":"targetPath",     "paramDescription": "the oaf path ",                            "paramRequired": true},
  {"paramName":"s",   "paramLongName":"skipUpdate",     "paramDescription": "skip update ",                             "paramRequired": false},
  {"paramName":"h",   "paramLongName":"hdfsServerUri",  "paramDescription": "the  working path ",                       "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/bio/bio_to_oaf_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/bio/bio_to_oaf_params.json
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
@ -0,0 +1,105 @@
 <workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path containing the EBI links dataset</description>
        </property>
        <property>
            <name>workingPath</name>
            <description>the Working Path</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the OAF MDStore Path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>resumeFrom</name>
            <value>DownloadEBILinks</value>
            <description>node to start</description>
        </property>
    </parameters>
    <start to="resume_from"/>
    <decision name="resume_from">
        <switch>
            <case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
            <case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
            <default to="DownloadEBILinks"/>
        </switch>
    </decision>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="DownloadEBILinks">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Incremental Download EBI Links</name>
            <class>eu.dnetlib.dhp.sx.bio.ebi.SparkDownloadEBILinks</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=2000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="OverrideFolders"/>
        <error to="Kill"/>
    </action>
    <action name="OverrideFolders">
        <fs>
            <delete path="${sourcePath}/ebi_links_dataset_old"/>
            <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
            <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
        </fs>
        <ok to="CreateEBIDataSet"/>
        <error to="Kill"/>
    </action>
    <action name="CreateEBIDataSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create OAF DataSet</name>
            <class>eu.dnetlib.dhp.sx.bio.ebi.SparkEBILinksToOaf</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=2000
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/workflow.xml
@ -1,17 +1,22 @@
-    <workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
+    <workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>baselineWorkingPath</name>
            <description>the Baseline Working Path</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the Target Path</description>
        </property>
        <property>
            <name>isLookupUrl</name>
            <description>The IS lookUp service endpoint</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>The target path</description>
        </property>
        <property>
            <name>skipUpdate</name>
            <value>false</value>
            <description>Whether to skip the update</description>
        </property>
    </parameters>
    <start to="ConvertDataset"/>
@ -24,9 +29,9 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>Convert Baseline to Dataset</name>
+            <name>Convert Baseline to OAF Dataset</name>
-            <class>eu.dnetlib.dhp.sx.graph.ebi.SparkCreateBaselineDataFrame</class>
+            <class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
@ -41,6 +46,8 @@
            <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
            <arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties
@ -7,3 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
 # A1 uses PatternLayout.
 log4j.appender.A1.layout=org.apache.log4j.PatternLayout
 log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
 log4j.logger.org.apache.spark=FATAL
 log4j.logger.org.spark_project=FATAL
--- a/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md
--- a/dhp-workflows/dhp-aggregation/src/site/markdown/index.md
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md
@ -0,0 +1,20 @@
 ## DHP-Aggregation
 This module defines a set of oozie workflows for
 1. the **collection** and **transformation** of metadata records.
 2. the **integration** of new external information in the result
 ### Collection and Transformation
 The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
 the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
 of each MDStore.
 It defines [mappings](mappings.md) for the transformation of different datasources (see the mapping section).
 ### Integration of external information in the result
 The workflows create new entities in the OpenAIRE format (OAF) whose aim is to enrich the results already contained in the graph.
 See the [integration](integration.md) section for more insight.
--- a/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md
@ -0,0 +1,36 @@
 DHP Aggregation - Integration method
 =====================================
 The integration method can be applied every time new information, which is not aggregated from the repositories
 nor computed directly by OpenAIRE, should be added to the results of the graph.
 The information integrated so far is:
 1. Article impact measures
    1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
 2. Result Subjects
    1. Integration of Fields of Science and Technology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
    results subjects.
 The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
 and the element in the OAF model that should be used to map the information we want to integrate.
 The id is set by using a particular encoding of the given PID
 *unresolved:[pid]:[pidtype]*
 where
 1. *unresolved* is a constant value
 2. *pid*  is the persistent id value, e.g. 10.5281/zenodo.4707307
 3. *pidtype* is the persistent id type, e.g. doi
 Such entities are matched against those available in the graph using the result.instance.pid values.
 This mechanism can be used to integrate enrichments associated with a given PID.
 If a match is found with one of the results already in the graph, that result is enriched with the information
 present in the new OAF entity.
 All the objects for which a match is not found are discarded.
--- a/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md
@ -0,0 +1,7 @@
 ## DHP-Aggregation
 This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
 Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
 the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
 of each MDStore.
--- a/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md
@ -0,0 +1,18 @@
 DHP Aggregation
 ===============
 DHP-Aggregation contains different mappings from the original data formats into the OAF data format,
 which converge in the graph in different ways:
 - Via Action Manager
 - Direct in the MdStore on Hadoop
 Below is the list of the implemented mappings.
 Mappings
 =======
 1. [PubMed](pubmed.md)
 2. [Datacite](datacite.md)
--- a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md
@ -0,0 +1,66 @@
 # Pubmed Mapping
 This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/).
 Collection
 ---------
 The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with 
 the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html) 
 Parsing
 -------
 The class responsible for parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser), which generates
 an intermediate mapping of the PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html).
 Mapping
 -------
 The table below describes the mapping from the native XML to the OAF model.
 | Xpath Source | Oaf Field  | Notes       |
 | ----------- | ----------- | ----------- |
 | //PMID      | pid         | classid = classname = pmid
 |  | **Instance Mapping** |      |
 |//PublicationType | InstanceType  | If the article contains the typology **Journal Article** we apply this type; otherwise we look for a term that matches the vocabulary and, if none is found, we discard the record
 |//PMID | instance/PID | Map the pmid also in the pid in the instance |
 | //ArticleId[./@IdType="doi"]   | instance/alternateIdentifier  |classid = classname = doi
 |//PMID | instance/URL | prepend to the PMId the base url https://pubmed.ncbi.nlm.nih.gov/
 | //PubmedPubDate | instance/Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it
 |  FOR ALL INSTANCE | CollectedFrom | datasourceName: *Europe PubMed Central* DatasourceId:   
 |  | **Journal Mapping** |      |
 |//Journal/PubDate| Journal/Conferencedate | map the date of the Journal
 |//Journal/Title| Journal/Name | |
 |//Journal/Volume| Journal/Vol | |
 |//Journal/ISSN| Journal/issPrinted | |
 |//Journal/Issue| Journal/Iss | |
 |  | **Publication Mapping** |      |
 | //PubmedPubDate | Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it
 | //Title | title | with qualifier ModelConstants.MAIN_TITLE_QUALIFIER
 | //AbstractText | Description ||
 |//Language| Language| cleaning vocabulary -> dnet:languages
 |//DescriptorName| Subject | classId, className = keyword
 |  | **Author Mapping** |      |
 |//Author/LastName| author.Surname| |
 |//Author/ForeName| author.Forename| |
 |//Author/FullName| author.Forename| Concatenation of forename + lastName if they exist |
 |FOR ALL AUTHOR | author.rank| sequential number starting from 1|
 # TODO
 Missing item mapped
--- a/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png
+++ b/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png
--- a/dhp-workflows/dhp-aggregation/src/site/site.xml
+++ b/dhp-workflows/dhp-aggregation/src/site/site.xml
@ -0,0 +1,34 @@
 <?xml version="1.0" encoding="ISO-8859-1"?>
 <!-- Maven site descriptor (decoration model 1.8.0) for the DHP-Aggregation module site. -->
 <project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
         name="DHP-Aggregation">
    <!-- Skin used to render the generated site. -->
    <skin>
        <groupId>org.apache.maven.skins</groupId>
        <artifactId>maven-fluido-skin</artifactId>
        <version>1.8</version>
    </skin>
    <!-- "Powered by" logo linking to the OpenAIRE Research Graph. -->
    <poweredBy>
        <logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
              img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
    </poweredBy>
    <body>
        <links>
            <item name="Code" href="https://code-repo.d4science.org/" />
        </links>
        <!-- Documentation menu: each href points to a page generated from src/site/markdown (or to the API docs). -->
        <menu name="Documentation">
            <item name="Mappings" href="mappings.html" collapse="true">
                <item name="Pubmed" href="pubmed.html"/>
                <item name="Datacite" href="datacite.html"/>
            </item>
            <!-- No sub-pages are listed under Integration. -->
            <item name="Integration" href="integration.html" collapse="true">
            </item>
            <item name="General Information" href="about.html"/>
            <item name="JavaDoc" href="apidocs/" />
            <item name="ScalaDoc" href="scaladocs/" />
        </menu>
        <!-- Standard Maven project reports menu. -->
        <menu ref="reports"/>
    </body>
 </project>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java
@ -0,0 +1,250 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
 import static org.junit.jupiter.api.Assertions.*;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.stream.Collectors;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.schema.oaf.Result;
 /**
  * Tests for the jobs that prepare the "unresolved" entities: the BIP! Finder measures
  * ({@link PrepareBipFinder}), the FOS csv rewrite ({@link GetFOSData}) and the FOS subjects
  * ({@link PrepareFOSSparkJob}). All tests share one local Spark session and one temporary working directory.
  */
 public class PrepareTest {
 	// bound to this class (it previously pointed at ProduceTest, so log lines were attributed to the wrong test)
 	private static final Logger log = LoggerFactory.getLogger(PrepareTest.class);
 	private static Path workingDir;
 	private static SparkSession spark;
 	private static LocalFileSystem fs;
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	/**
 	 * Creates the temporary working directory, the local file system handle and the local Spark session.
 	 */
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName());
 		fs = FileSystem.getLocal(new Configuration());
 		log.info("using work dir {}", workingDir);
 		SparkConf conf = new SparkConf();
 		conf.setAppName(PrepareTest.class.getSimpleName());
 		conf.setMaster("local[*]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.toString());
 		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
 		spark = SparkSession
 			.builder()
 			.appName(PrepareTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}
 	/**
 	 * Removes the working directory and stops Spark; the session is stopped even when the deletion fails.
 	 */
 	@AfterAll
 	public static void afterAll() throws IOException {
 		try {
 			FileUtils.deleteDirectory(workingDir.toFile());
 		} finally {
 			spark.stop();
 		}
 	}
 	/**
 	 * Returns the value of the first unit of the measure with the given id, failing the test when the measure
 	 * is not present in the result.
 	 */
 	private static String measureValue(Result result, String measureId) {
 		return result
 			.getMeasures()
 			.stream()
 			.filter(sl -> sl.getId().equals(measureId))
 			.findFirst()
 			.orElseThrow(() -> new AssertionError("measure not found: " + measureId))
 			.getUnit()
 			.get(0)
 			.getValue();
 	}
 	/**
 	 * Collects the subject values of all the results having the given id.
 	 */
 	private static java.util.List<String> subjectValues(JavaRDD<Result> results, String id) {
 		return results
 			.filter(r -> r.getId().equals(id))
 			.flatMap(r -> r.getSubject().iterator())
 			.map(sbj -> sbj.getValue())
 			.collect();
 	}
 	@Test
 	void bipPrepareTest() throws Exception {
 		final String sourcePath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
 			.getPath();
 		PrepareBipFinder
 			.main(
 				new String[] {
 					"--isSparkSessionManaged", Boolean.FALSE.toString(),
 					"--sourcePath", sourcePath,
 					"--outputPath", workingDir.toString() + "/work"
 				});
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		JavaRDD<Result> tmp = sc
 			.textFile(workingDir.toString() + "/work/bip")
 			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
 		Assertions.assertEquals(86, tmp.count());
 		String doi1 = "unresolved::10.0000/096020199389707::doi";
 		// collect the record once instead of re-running the same Spark job for every assertion
 		java.util.List<Result> matches = tmp.filter(r -> r.getId().equals(doi1)).collect();
 		Assertions.assertEquals(1, matches.size());
 		Result result = matches.get(0);
 		Assertions.assertEquals(3, result.getMeasures().size());
 		Assertions.assertEquals("6.34596412687e-09", measureValue(result, "influence"));
 		Assertions.assertEquals("0.641151896994", measureValue(result, "popularity_alt"));
 		Assertions.assertEquals("2.33375102921e-09", measureValue(result, "popularity"));
 	}
 	@Test
 	void getFOSFileTest() throws IOException, ClassNotFoundException {
 		final String sourcePath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv")
 			.getPath();
 		final String outputPath = workingDir.toString() + "/fos.json";
 		new GetFOSData()
 			.doRewrite(
 				sourcePath, outputPath, "eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel",
 				'\t', fs);
 		int count = 0;
 		// try-with-resources: the reader (and the underlying stream) is closed even if parsing fails
 		try (BufferedReader in = new BufferedReader(
 			new InputStreamReader(fs.open(new org.apache.hadoop.fs.Path(outputPath))))) {
 			String line;
 			while ((line = in.readLine()) != null) {
 				// every line must be a valid serialization of FOSDataModel
 				FOSDataModel fos = OBJECT_MAPPER.readValue(line, FOSDataModel.class);
 				log.info(OBJECT_MAPPER.writeValueAsString(fos));
 				count += 1;
 			}
 		}
 		assertEquals(38, count);
 	}
 	@Test
 	void fosPrepareTest() throws Exception {
 		final String sourcePath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
 			.getPath();
 		PrepareFOSSparkJob
 			.main(
 				new String[] {
 					"--isSparkSessionManaged", Boolean.FALSE.toString(),
 					"--sourcePath", sourcePath,
 					"-outputPath", workingDir.toString() + "/work"
 				});
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		JavaRDD<Result> tmp = sc
 			.textFile(workingDir.toString() + "/work/fos")
 			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
 		String doi1 = "unresolved::10.3390/s18072310::doi";
 		assertEquals(50, tmp.count());
 		assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());
 		java.util.List<String> subjects1 = subjectValues(tmp, doi1);
 		assertTrue(subjects1.contains("engineering and technology"));
 		assertTrue(subjects1.contains("nano-technology"));
 		assertTrue(subjects1.contains("nanoscience & nanotechnology"));
 		String doi = "unresolved::10.1111/1365-2656.12831::doi";
 		assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
 		java.util.List<String> subjects = subjectValues(tmp, doi);
 		assertTrue(subjects.contains("psychology and cognitive sciences"));
 		assertTrue(subjects.contains("social sciences"));
 		// the "NULL" placeholder must not end up as a subject
 		assertFalse(subjects.contains("NULL"));
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
@ -0,0 +1,234 @@
 package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
 import java.util.stream.Collectors;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 /**
  * End-to-end test of the unresolved entities production: runs {@link PrepareBipFinder} and
  * {@link PrepareFOSSparkJob} on the test resources, merges their outputs with {@link SparkSaveUnresolved}
  * and verifies the merged records.
  */
 public class ProduceTest {
 	private static final Logger log = LoggerFactory.getLogger(ProduceTest.class);
 	private static Path workingDir;
 	private static SparkSession spark;
 	private static LocalFileSystem fs;
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static final String ID_PREFIX = "50|doi_________";
 	/**
 	 * Creates the temporary working directory, the local file system handle and the local Spark session.
 	 */
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files.createTempDirectory(ProduceTest.class.getSimpleName());
 		fs = FileSystem.getLocal(new Configuration());
 		log.info("using work dir {}", workingDir);
 		SparkConf conf = new SparkConf();
 		conf.setAppName(ProduceTest.class.getSimpleName());
 		conf.setMaster("local[*]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.toString());
 		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
 		spark = SparkSession
 			.builder()
 			.appName(ProduceTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}
 	/**
 	 * Removes the working directory and stops Spark; the session is stopped even when the deletion fails.
 	 */
 	@AfterAll
 	public static void afterAll() throws IOException {
 		try {
 			FileUtils.deleteDirectory(workingDir.toFile());
 		} finally {
 			spark.stop();
 		}
 	}
 	/**
 	 * Returns the value of the first unit of the measure with the given id, failing the test when the measure
 	 * is missing.
 	 */
 	private static String measureValue(List<Measure> measures, String measureId) {
 		return measures
 			.stream()
 			.filter(mes -> mes.getId().equals(measureId))
 			.findFirst()
 			.orElseThrow(() -> new AssertionError("measure not found: " + measureId))
 			.getUnit()
 			.get(0)
 			.getValue();
 	}
 	@Test
 	void produceTest() throws Exception {
 		final String bipPath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
 			.getPath();
 		PrepareBipFinder
 			.main(
 				new String[] {
 					"--isSparkSessionManaged", Boolean.FALSE.toString(),
 					"--sourcePath", bipPath,
 					"--outputPath", workingDir.toString() + "/work"
 				});
 		final String fosPath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
 			.getPath();
 		PrepareFOSSparkJob
 			.main(
 				new String[] {
 					"--isSparkSessionManaged", Boolean.FALSE.toString(),
 					"--sourcePath", fosPath,
 					"-outputPath", workingDir.toString() + "/work"
 				});
 		SparkSaveUnresolved.main(new String[] {
 			"--isSparkSessionManaged", Boolean.FALSE.toString(),
 			"--sourcePath", workingDir.toString() + "/work",
 			"-outputPath", workingDir.toString() + "/unresolved"
 		});
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		JavaRDD<Result> tmp = sc
 			.textFile(workingDir.toString() + "/unresolved")
 			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
 		Assertions.assertEquals(135, tmp.count());
 		final String doi = "unresolved::10.3390/s18072310::doi";
 		// collect the record under test once instead of re-running the same Spark job for every assertion
 		List<Result> matches = tmp.filter(row -> row.getId().equals(doi)).collect();
 		Assertions.assertEquals(1, matches.size());
 		Result merged = matches.get(0);
 		Assertions.assertEquals(3, merged.getSubject().size());
 		Assertions.assertEquals(3, merged.getMeasures().size());
 		List<StructuredProperty> sbjs = merged.getSubject();
 		for (StructuredProperty sbj : sbjs) {
 			// qualifier
 			Assertions.assertEquals("FOS", sbj.getQualifier().getClassid());
 			Assertions
 				.assertEquals(
 					"Fields of Science and Technology classification", sbj.getQualifier().getClassname());
 			Assertions.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemeid());
 			Assertions.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemename());
 			// data info
 			Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference());
 			Assertions.assertEquals(true, sbj.getDataInfo().getInferred());
 			Assertions.assertEquals(false, sbj.getDataInfo().getInvisible());
 			Assertions.assertEquals("", sbj.getDataInfo().getTrust());
 			Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance());
 			// provenance action
 			Assertions.assertEquals("subject:fos", sbj.getDataInfo().getProvenanceaction().getClassid());
 			Assertions
 				.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname());
 			Assertions
 				.assertEquals(
 					ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid());
 			Assertions
 				.assertEquals(
 					ModelConstants.DNET_PROVENANCE_ACTIONS,
 					sbj.getDataInfo().getProvenanceaction().getSchemename());
 		}
 		// the anyMatch results used to be discarded: they are now real assertions
 		Assertions.assertTrue(sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("engineering and technology")));
 		Assertions.assertTrue(sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nano-technology")));
 		Assertions.assertTrue(sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nanoscience & nanotechnology")));
 		List<Measure> measures = merged.getMeasures();
 		Assertions.assertEquals("7.5597134689e-09", measureValue(measures, "influence"));
 		Assertions.assertEquals("4.903880192", measureValue(measures, "popularity_alt"));
 		Assertions.assertEquals("1.17977512835e-08", measureValue(measures, "popularity"));
 		// remaining records: 49 carry subjects, 85 carry measures
 		Assertions
 			.assertEquals(
 				49, tmp
 					.filter(row -> !row.getId().equals(doi))
 					.filter(row -> row.getSubject() != null)
 					.count());
 		Assertions
 			.assertEquals(
 				85,
 				tmp
 					.filter(row -> !row.getId().equals(doi))
 					.filter(r -> r.getMeasures() != null)
 					.count());
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala
@ -1,57 +0,0 @@
 package eu.dnetlib.dhp.actionmanager.datacite
 import com.fasterxml.jackson.databind.ObjectMapper
 import com.fasterxml.jackson.databind.SerializationFeature
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import org.junit.jupiter.api.extension.ExtendWith
 import org.junit.jupiter.api.{BeforeEach, Test}
 import org.mockito.junit.jupiter.MockitoExtension
 import java.text.SimpleDateFormat
 import java.util.Locale
 import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
 class DataciteToOAFTest extends  AbstractVocabularyTest{
  /** Loads the vocabularies before every test. */
  @BeforeEach
  def setUp() :Unit = super.setUpVocabulary()
  /** Parses an ISO-8601 timestamp with a numeric zone offset and prints its epoch milliseconds. */
  @Test
  def testDateMapping:Unit = {
    val formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
    val parsed = formatter.parse("2021-07-14T11:52:54+0000")
    println(parsed.getTime)
  }
  /** Converts the sample Datacite record into OAF entities and prints each of them as indented JSON. */
  @Test
  def testMapping() :Unit = {
    val jsonWriter = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
    val rawRecord = Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
    val entities:List[Oaf] = DataciteToOAFTransformation.generateOAF(rawRecord, 0L, 0L, vocabularies, true)
    for (entity <- entities) {
      println (jsonWriter.writeValueAsString(entity))
      println("----------------------------")
    }
  }
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java
@ -0,0 +1,335 @@
 package eu.dnetlib.dhp.actionmanager.opencitations;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 public class CreateOpenCitationsASTest {
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static SparkSession spark;
 	private static Path workingDir;
 	private static final Logger log = LoggerFactory
 		.getLogger(CreateOpenCitationsASTest.class);
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files
 			.createTempDirectory(CreateOpenCitationsASTest.class.getSimpleName());
 		log.info("using work dir {}", workingDir);
 		SparkConf conf = new SparkConf();
 		conf.setAppName(CreateOpenCitationsASTest.class.getSimpleName());
 		conf.setMaster("local[*]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.toString());
 		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
 		spark = SparkSession
 			.builder()
 			.appName(CreateOpenCitationsASTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}
 	@AfterAll
 	public static void afterAll() throws IOException {
 		FileUtils.deleteDirectory(workingDir.toFile());
 		spark.stop();
 	}
 	@Test
 	void testNumberofRelations() throws Exception {
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 		CreateActionSetSparkJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-shouldDuplicateRels",
 					Boolean.TRUE.toString(),
 					"-inputPath",
 					inputPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
 		assertEquals(60, tmp.count());
 		// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
 	}
 	@Test
 	void testNumberofRelations2() throws Exception {
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 		CreateActionSetSparkJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
 		assertEquals(44, tmp.count());
 		// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
 	}
 	@Test
 	void testRelationsCollectedFrom() throws Exception {
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 		CreateActionSetSparkJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
 		tmp.foreach(r -> {
 			assertEquals(ModelConstants.OPENOCITATIONS_NAME, r.getCollectedfrom().get(0).getValue());
 			assertEquals(ModelConstants.OPENOCITATIONS_ID, r.getCollectedfrom().get(0).getKey());
 		});
 	}
 	@Test
 	void testRelationsDataInfo() throws Exception {
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 		CreateActionSetSparkJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
 		tmp.foreach(r -> {
 			assertEquals(false, r.getDataInfo().getInferred());
 			assertEquals(false, r.getDataInfo().getDeletedbyinference());
 			assertEquals("0.91", r.getDataInfo().getTrust());
 			assertEquals(
 				CreateActionSetSparkJob.OPENCITATIONS_CLASSID, r.getDataInfo().getProvenanceaction().getClassid());
 			assertEquals(
 				CreateActionSetSparkJob.OPENCITATIONS_CLASSNAME, r.getDataInfo().getProvenanceaction().getClassname());
 			assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemeid());
 			assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemename());
 		});
 	}
 	@Test
 	void testRelationsSemantics() throws Exception {
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 		CreateActionSetSparkJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
 		tmp.foreach(r -> {
 			assertEquals("citation", r.getSubRelType());
 			assertEquals("resultResult", r.getRelType());
 		});
 		assertEquals(22, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
 		assertEquals(22, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
 	}
 	@Test
 	void testRelationsSourceTargetPrefix() throws Exception {
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 		CreateActionSetSparkJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
 		tmp.foreach(r -> {
 			assertEquals("50|doi_________::", r.getSource().substring(0, 17));
 			assertEquals("50|doi_________::", r.getTarget().substring(0, 17));
 		});
 	}
 	@Test
 	void testRelationsSourceTargetCouple() throws Exception {
 		final String doi1 = "50|doi_________::"
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
 		final String doi2 = "50|doi_________::"
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
 		final String doi3 = "50|doi_________::"
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
 		final String doi4 = "50|doi_________::"
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
 		final String doi5 = "50|doi_________::"
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
 		final String doi6 = "50|doi_________::"
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 		CreateActionSetSparkJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Relation) aa.getPayload()));
 		JavaRDD<Relation> check = tmp.filter(r -> r.getSource().equals(doi1) || r.getTarget().equals(doi1));
 		assertEquals(10, check.count());
 		check.foreach(r -> {
 			if (r.getSource().equals(doi2) || r.getSource().equals(doi3) || r.getSource().equals(doi4) ||
 				r.getSource().equals(doi5) || r.getSource().equals(doi6)) {
 				assertEquals(ModelConstants.IS_CITED_BY, r.getRelClass());
 				assertEquals(doi1, r.getTarget());
 			}
 		});
 		assertEquals(5, check.filter(r -> r.getSource().equals(doi1)).count());
 		check.filter(r -> r.getSource().equals(doi1)).foreach(r -> assertEquals(ModelConstants.CITES, r.getRelClass()));
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala
@ -0,0 +1,113 @@
 package eu.dnetlib.dhp.datacite
 import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import org.apache.commons.io.FileUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.functions.{col, count}
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
 import org.junit.jupiter.api.extension.ExtendWith
 import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.{Logger, LoggerFactory}
 import java.nio.file.{Files, Path}
 import java.text.SimpleDateFormat
 import java.util.Locale
 import scala.io.Source
 import org.junit.jupiter.api.Assertions._
@ExtendWith(Array(classOf[MockitoExtension]))
 /**
  * Tests for the Datacite to OAF conversion.
  *
  * Each test gets a fresh temporary working directory (created in `setUp`,
  * removed in `tearDown`) and the vocabularies initialised through
  * `setUpVocabulary()` of the parent test class.
  */
 class DataciteToOAFTest extends  AbstractVocabularyTest{

  // Scratch directory for the current test; assigned by the JUnit lifecycle in setUp.
  private var workingDir:Path = null
  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Creates the scratch directory and initialises the vocabularies. */
  @BeforeEach
  def setUp() :Unit = {
    workingDir= Files.createTempDirectory(getClass.getSimpleName)
    super.setUpVocabulary()
  }

  /** Removes the scratch directory created by `setUp`. */
  @AfterEach
  def tearDown() :Unit = {
    // setUp may have failed before the directory was created; do not mask
    // that failure with a NullPointerException here.
    if (workingDir != null) {
      FileUtils.deleteDirectory(workingDir.toFile)
    }
  }

  /** Checks that an ISO-8601 timestamp with a numeric zone offset is parsed to the right instant. */
  @Test
  def testDateMapping():Unit = {
    val inputDate = "2021-07-14T11:52:54+0000"
    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
    val dt = ISO8601FORMAT.parse(inputDate)
    // 2021-07-14T11:52:54 UTC expressed as milliseconds since the epoch
    assertEquals(1626263574000L, dt.getTime)
  }

  /**
   * Runs `generateDataciteDataset` on the bundled Datacite dataset with a
   * local Spark session, then checks that the input holds 100 records and
   * that something was written to the target path.
   */
  @Test
  def testConvert(): Unit = {
    val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
    val conf = new SparkConf()
    val spark:SparkSession =  SparkSession.builder().config(conf)
      .appName(getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Stop the session even when an assertion fails, so it does not leak
    // into the tests that run afterwards.
    try {
      implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
      val instance = new GenerateDataciteDatasetSpark(null, null, log)
      val targetPath = s"$workingDir/result"
      instance.generateDataciteDataset(path, exportLinks = true, vocabularies,targetPath, spark)
      import spark.implicits._
      val nativeSize =spark.read.load(path).count()
      assertEquals(100, nativeSize)
      val result:Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
      // Diagnostic output only: number of generated entities per concrete class.
      result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false)
      val t = spark.read.load(targetPath).count()
      assertTrue(t >0)
    } finally {
      spark.stop()
    }
  }

  /**
   * Smoke test: converts a single Datacite record and prints the generated
   * entities as indented JSON. It has no assertions, so it only fails when
   * the conversion or the serialisation throws.
   */
  @Test
  def testMapping() :Unit = {
    val source = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json"))
    // Release the underlying stream once the content has been read.
    val record = try source.mkString finally source.close()
    val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
    val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
    res.foreach(r => {
      println (mapper.writeValueAsString(r))
      println("----------------------------")
    })
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala
@ -1,13 +1,10 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed
+package eu.dnetlib.dhp.sx.bio
 import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
-import eu.dnetlib.dhp.schema.common.ModelConstants
+import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
 import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, OafMapperUtils, PidType}
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
+import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
 import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
 import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
@ -55,7 +52,7 @@ class BioScholixTest extends AbstractVocabularyTest{
  // Reads the pubmed.xml test resource from the classpath, feeds it to
  // PMParser through an XMLEventReader and prints every parsed element
  // serialised with `mapper` (defined elsewhere in this class).
  // There are no assertions: the test only fails if reading, parsing or
  // serialising throws.
  @Test
  def testEBIData() = {
-    val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
+    val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
    val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
    new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
  }
@ -65,7 +62,7 @@ class BioScholixTest extends AbstractVocabularyTest{
  def testPubmedToOaf(): Unit = {
    assertNotNull(vocabularies)
    assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
-    val records:String =Source.fromInputStream(getClass.getResourceAsStream("pubmed_dump")).mkString
+    val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
    val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
    assertEquals(10, r.size)
    assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json
@ -0,0 +1,86 @@
 {"10.3390/s18072310": [{"id": "influence", "unit": [{"value": "7.5597134689e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "4.903880192", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.17977512835e-08", "key": "score"}]}]}
 {"10.0000/096020199389707": [{"id": "influence", "unit": [{"value": "6.34596412687e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.641151896994", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.33375102921e-09", "key": "score"}]}]}
 {"10.00000/jpmc.2017.106": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
 {"10.0000/9781845416881": [{"id": "influence", "unit": [{"value": "5.96492048955e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "1.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.12641925838e-08", "key": "score"}]}]}
 {"10.0000/anziamj.v0i0.266": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.76260934675e-10", "key": "score"}]}]}
 {"10.0000/anziamj.v48i0.79": [{"id": "influence", "unit": [{"value": "6.93311506443e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.002176782336", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.7668105708e-09", "key": "score"}]}]}
 {"10.0000/anziamj.v50i0.1472": [{"id": "influence", "unit": [{"value": "6.26777280882e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.406656", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.39745193285e-09", "key": "score"}]}]}
 {"10.0000/cja5553": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/czastest.16": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/czastest.17": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
 {"10.0000/czastest.18": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
 {"10.0000/czastest.20": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/czastest.21": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
 {"10.0000/czastest.28": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
 {"10.0000/czastest.60": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/czt.2019.1.2.15": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v4i02.36": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v4i02.37": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v4i02.38": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v5i01.32": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v6i01.24": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v6i01.27": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v6i02.41": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v6i02.44": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i01.40": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i01.42": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i01.47": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i01.51": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i01.52": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i02.86": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i02.88": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v7i02.91": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v8i01.129": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v8i01.180": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
 {"10.0000/geoekonomi.v8i01.87": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
 {"10.0000/hbv2004w010": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hbv2101w001": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2101w002": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2101w003": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2101w004": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2101w005": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2101w006": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2101w007": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2102w001": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hbv2102w010": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v1i1.13207": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v1i1.13208": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v1i1.13209": [{"id": "influence", "unit": [{"value": "6.32078461509e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "1.6", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.3168486939e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v1i1.13210": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v1i1.13211": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v1i1.13212": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v1i2.13231": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i2.28782": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i2.28783": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i2.28784": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i2.28786": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i2.28787": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i2.28788": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i3.28234": [{"id": "influence", "unit": [{"value": "6.40470414877e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.6", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.89465099068e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i3.28236": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i3.28238": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i3.28239": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i3.28242": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v2i3.28243": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i4.38186": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i4.38187": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i4.38190": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i4.38207": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i4.38209": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i5.41163": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i5.41166": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i5.41167": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i5.41168": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v3i5.41229": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.36360": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.40796": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.41153": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.42511": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.42555": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.42752": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.42768": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i6.42795": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i7.41295": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i7.42830": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i7.42861": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
 {"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json
@ -0,0 +1,38 @@
 {"doi":"10.3390/s18072310","level1":"engineering and technology","level2":"nano-technology","level3":"nanoscience & nanotechnology"}
 {"doi":"10.1111/1365-2656.12831\u000210.17863/cam.24369","level1":"social sciences","level2":"psychology and cognitive sciences","level3":"NULL"}
 {"doi":"10.3929/ethz-b-000187584\u000210.1002/chem.201701644","level1":"natural sciences","level2":"NULL","level3":"NULL"}
 {"doi":"10.1080/01913123.2017.1367361","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
 {"doi":"10.1051/e3sconf/20199207011","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"environmental sciences"}
 {"doi":"10.1038/onc.2015.333","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
 {"doi":"10.1093/mnras/staa256","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
 {"doi":"10.1016/j.jclepro.2018.07.166","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
 {"doi":"10.1103/physrevlett.125.037403","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
 {"doi":"10.1080/03602532.2017.1316285","level1":"natural sciences","level2":"NULL","level3":"NULL"}
 {"doi":"10.1001/jamanetworkopen.2019.1868","level1":"medical and health sciences","level2":"other medical science","level3":"health policy & services"}
 {"doi":"10.1128/mra.00874-18","level1":"natural sciences","level2":"biological sciences","level3":"plant biology & botany"}
 {"doi":"10.1016/j.nancom.2018.03.001","level1":"engineering and technology","level2":"NULL","level3":"NULL"}
 {"doi":"10.1112/topo.12174","level1":"natural sciences","level2":"NULL","level3":"NULL"}
 {"doi":"10.12688/wellcomeopenres.15846.1","level1":"medical and health sciences","level2":"health sciences","level3":"NULL"}
 {"doi":"10.21468/scipostphys.3.1.001","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
 {"doi":"10.1088/1741-4326/ab6c77","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
 {"doi":"10.1109/tpwrs.2019.2944747","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
 {"doi":"10.1016/j.expthermflusci.2019.109994\u000210.17863/cam.46212","level1":"engineering and technology","level2":"mechanical engineering","level3":"mechanical engineering & transports"}
 {"doi":"10.1109/tc.2018.2860012","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"computer hardware & architecture"}
 {"doi":"10.1002/mma.6622","level1":"natural sciences","level2":"mathematics","level3":"numerical & computational mathematics"}
 {"doi":"10.1051/radiopro/2020020","level1":"natural sciences","level2":"chemical sciences","level3":"NULL"}
 {"doi":"10.1007/s12268-019-1003-4","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
 {"doi":"10.3390/cancers12010236","level1":"medical and health sciences","level2":"health sciences","level3":"biochemistry & molecular biology"}
 {"doi":"10.6084/m9.figshare.9912614\u000210.6084/m9.figshare.9912614.v1\u000210.1080/00268976.2019.1665199","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"}
 {"doi":"10.1175/jpo-d-17-0239.1","level1":"natural sciences","level2":"biological sciences","level3":"marine biology & hydrobiology"}
 {"doi":"10.1007/s13218-020-00674-7","level1":"engineering and technology","level2":"industrial biotechnology","level3":"industrial engineering & automation"}
 {"doi":"10.1016/j.psyneuen.2016.02.003\u000210.1016/j.psyneuen.2016.02.00310.7892/boris.78886\u000210.7892/boris.78886","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
 {"doi":"10.1109/ted.2018.2813542","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
 {"doi":"10.3989/scimar.04739.25a","level1":"natural sciences","level2":"biological sciences","level3":"NULL"}
 {"doi":"10.3390/su12187503","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"NULL"}
 {"doi":"10.1016/j.ccell.2018.08.017","level1":"medical and health sciences","level2":"basic medicine","level3":"biochemistry & molecular biology"}
 {"doi":"10.1103/physrevresearch.2.023322","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
 {"doi":"10.1039/c8cp03234c","level1":"natural sciences","level2":"NULL","level3":"NULL"}
 {"doi":"10.5281/zenodo.3696557\u000210.5281/zenodo.3696556\u000210.1109/jsac.2016.2545384","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"}
 {"doi":"10.1038/ng.3667\u000210.1038/ng.3667.\u000210.17615/tct6-4m26\u000210.17863/cam.15649","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
 {"doi":"10.1016/j.jclepro.2019.119065","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
 {"doi":"10.1111/pce.13392","level1":"agricultural and veterinary sciences","level2":"agriculture, forestry, and fisheries","level3":"agronomy & agriculture"}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv
@ -0,0 +1,38 @@
 dedup_wf_001::ddcc7a56fa13e49bcc59c6bdd19ad26c	10.3390/s18072310	engineering and technology	nano-technology	nanoscience & nanotechnology
 dedup_wf_001::b76062d56e28224eac56111a4e1e5ecf	10.1111/1365-2656.1283110.17863/cam.24369	social sciences	psychology and cognitive sciences	NULL
 dedup_wf_001::bb752acb8f403a25fa7851a302f7b7ac	10.3929/ethz-b-00018758410.1002/chem.201701644	natural sciences	NULL	NULL
 dedup_wf_001::2f1435a9201ecf5cbbcb12c9b2d971cd	10.1080/01913123.2017.1367361	medical and health sciences	clinical medicine	oncology & carcinogenesis
 dedup_wf_001::fc9e47ec16c67b101724320d4b030514	10.1051/e3sconf/20199207011	natural sciences	earth and related environmental sciences	environmental sciences
 dedup_wf_001::caa1e5b4de387cb31751552f4f0f5d72	10.1038/onc.2015.333	medical and health sciences	clinical medicine	oncology & carcinogenesis
 dedup_wf_001::c2a98df5637d69bf0524eaf40fe6bf11	10.1093/mnras/staa256	natural sciences	physical sciences	NULL
 dedup_wf_001::c221262bdc77cbfd59859a402f0e3991	10.1016/j.jclepro.2018.07.166	engineering and technology	other engineering and technologies	building & construction
 doiboost____::d56d9dc21f317b3e009d5b6c8ea87212	10.1103/physrevlett.125.037403	natural sciences	physical sciences	nuclear & particles physics
 dedup_wf_001::8a7269c8ee6470b2fb4fd384bc389e08	10.1080/03602532.2017.1316285	natural sciences	NULL	NULL
 dedup_wf_001::28342ebbc19833e4e1f4a2b23cf5ee20	10.1001/jamanetworkopen.2019.1868	medical and health sciences	other medical science	health policy & services
 dedup_wf_001::c1e1daf2b55dd9ec8e1c7c7458bbc7bc	10.1128/mra.00874-18	natural sciences	biological sciences	plant biology & botany
 dedup_wf_001::a2ef4a2720c71907180750e5871298ef	10.1016/j.nancom.2018.03.001	engineering and technology	NULL	NULL
 dedup_wf_001::676f46a31519e83a89efcb1c626286fb	10.1112/topo.12174	natural sciences	NULL	NULL
 dedup_wf_001::6f2761642f1e39313388e2c4060657dd	10.12688/wellcomeopenres.15846.1	medical and health sciences	health sciences	NULL
 dedup_wf_001::e414c1dec599521a9635a60de0f6755b	10.21468/scipostphys.3.1.001	natural sciences	physical sciences	NULL
 dedup_wf_001::f3395fe0f330164ea424dc61c86c9a3d	10.1088/1741-4326/ab6c77	natural sciences	physical sciences	nuclear & particles physics
 dedup_wf_001::a4f32a97a783117012f1de11797e73f2	10.1109/tpwrs.2019.2944747	engineering and technology	electrical engineering, electronic engineering, information engineering	electrical & electronic engineering
 dedup_wf_001::313ae1cd083ae1696d12dd1909f97df8	10.1016/j.expthermflusci.2019.10999410.17863/cam.46212	engineering and technology	mechanical engineering	mechanical engineering & transports
 dedup_wf_001::2a300a7d3ca7347791ebcef986bc0682	10.1109/tc.2018.2860012	engineering and technology	electrical engineering, electronic engineering, information engineering	computer hardware & architecture
 doiboost____::5b79bd7bd9f87361b4a4abc3cbb2df75	10.1002/mma.6622	natural sciences	mathematics	numerical & computational mathematics
 dedup_wf_001::6a3f61f217a2519fbaddea1094e3bfc2	10.1051/radiopro/2020020	natural sciences	chemical sciences	NULL
 dedup_wf_001::a3f0430309a639f4234a0e57b10f2dee	10.1007/s12268-019-1003-4	medical and health sciences	basic medicine	NULL
 dedup_wf_001::b6b8a3a1cccbee459cf3343485efdb12	10.3390/cancers12010236	medical and health sciences	health sciences	biochemistry & molecular biology
 dedup_wf_001::dd06ee7974730e7b09a4f03c83b3f9bd	10.6084/m9.figshare.991261410.6084/m9.figshare.9912614.v110.1080/00268976.2019.1665199	natural sciences	chemical sciences	physical chemistry
 dedup_wf_001::027c78bef6f972b5e26dfea55d30fbe3	10.1175/jpo-d-17-0239.1	natural sciences	biological sciences	marine biology & hydrobiology
 dedup_wf_001::43edc179aa9e1fbaf582c5203b18b519	10.1007/s13218-020-00674-7	engineering and technology	industrial biotechnology	industrial engineering & automation
 dedup_wf_001::e7770e11cd6eb514bb52c07b5a8a80f0	10.1016/j.psyneuen.2016.02.00310.1016/j.psyneuen.2016.02.00310.7892/boris.7888610.7892/boris.78886	medical and health sciences	basic medicine	NULL
 dedup_wf_001::80bc15d69bdc589149631f3439dde5aa	10.1109/ted.2018.2813542	engineering and technology	electrical engineering, electronic engineering, information engineering	electrical & electronic engineering
 dedup_wf_001::42c1cfa33e7872944b920cff90f4d99e	10.3989/scimar.04739.25a	natural sciences	biological sciences	NULL
 dedup_wf_001::9bacdbbaa9da3658b7243d5de8e3ce14	10.3390/su12187503	natural sciences	earth and related environmental sciences	NULL
 dedup_wf_001::59e43d3527dcfecb6097fbd5740c8950	10.1016/j.ccell.2018.08.017	medical and health sciences	basic medicine	biochemistry & molecular biology
 doiboost____::e024d1b738df3b24bc58fa0228542571	10.1103/physrevresearch.2.023322	natural sciences	physical sciences	nuclear & particles physics
 dedup_wf_001::66e9a3237fa8178886d26d3c2d5b9e66	10.1039/c8cp03234c	natural sciences	NULL	NULL
 dedup_wf_001::83737ab4205bae751571bb3b166efa18	10.5281/zenodo.369655710.5281/zenodo.369655610.1109/jsac.2016.2545384	engineering and technology	electrical engineering, electronic engineering, information engineering	networking & telecommunications
 dedup_wf_001::e3f892db413a689e572dd256acad55fe	10.1038/ng.366710.1038/ng.3667.10.17615/tct6-4m2610.17863/cam.15649	medical and health sciences	health sciences	genetics & heredity
 dedup_wf_001::14ba594e8fd081847bc3f50f56335003	10.1016/j.jclepro.2019.119065	engineering and technology	other engineering and technologies	building & construction
 dedup_wf_001::08ac7b33a41bcea2d055ecd8585d632e	10.1111/pce.13392	agricultural and veterinary sciences	agriculture, forestry, and fisheries	agronomy & agriculture
--- a/Show More
+++ b/Show More
		`@ -0,0 +1,2 @@`
							#!/bin/bash
							# Fetches a list of remote resources and streams each one into HDFS.
							#
							# Arguments:
							#   $1  semicolon-separated list of entries, each of the form <url>@<name>
							#   $2  HDFS directory under which every <name> is written
							#
							# For each entry the part before the first '@' is passed to `curl -L`
							# (follow redirects) and curl's output is piped to `hdfs dfs -put -`,
							# whose destination is "$2/<name>", <name> being the second '@'-separated
							# field of the entry.
							#
							# All expansions are quoted so that URLs or names containing characters
							# such as '?', '*' or '[' are never subjected to word splitting or
							# pathname expansion by the shell.

							# The entry list is fed to the loop on file descriptor 3 so that the
							# commands inside the loop keep their normal stdin.
							while IFS= read -r entry <&3; do
								# A trailing ';' or an empty first argument yields an empty entry: skip it.
								[ -n "$entry" ] || continue

								# printf is used instead of echo so that entries starting with '-' or
								# containing backslashes are passed through unchanged.
								url=$(printf '%s\n' "$entry" | cut -d '@' -f 1)
								target=$(printf '%s\n' "$entry" | cut -d '@' -f 2)

								curl -L "$url" | hdfs dfs -put - "$2/$target"
							done 3< <(printf '%s\n' "$1" | tr ';' '\n')