Compare commits

...

80 Commits

Author SHA1 Message Date
dimitrispie aedd279f78 Updates Promotion DBs
- Add a step for promoting the split monitor DBs
2023-07-13 15:35:46 +03:00
Claudio Atzori 5b6844b969 mapping funding relations from Datacite should be done according to the actual result identifier 2021-07-23 18:14:37 +02:00
Claudio Atzori ffdb2a3ea3 [cleaning] fixed filtering function for missing titles 2021-07-23 11:55:55 +02:00
Alessia Bardi 9069958479 tests for enermaps 2021-07-20 19:31:43 +02:00
Claudio Atzori 77e8c6c7f7 filtering 'old' OpenAIRE ids from the entity.originalId[] array in the OAF -> XML serialization procedure 2021-07-20 11:51:33 +02:00
Claudio Atzori 5947cddafc adding record identifier among the originalIds regardless of what IdentifierFactory produces 2021-07-19 17:52:24 +02:00
Miriam Baglioni 13cf444f85 Merge pull request 'force orginalId for claimed records' (#124) from forceOrginalId_claims into master
Reviewed-on: D-Net/dnet-hadoop#124
2021-07-19 17:41:58 +02:00
Claudio Atzori 5e5f65a3c3 contents mapped from the stores with 'claim' interpretation will not change their identifier along their way towards the graph 2021-07-19 15:56:55 +02:00
Claudio Atzori 9913b6073c Merge pull request 'orcid-no-doi' (#123) from enrico.ottonello/dnet-hadoop:orcid-no-doi into master
Reviewed-on: D-Net/dnet-hadoop#123
2021-07-15 17:53:58 +02:00
Enrico Ottonello 2dc50c0999 added default value to process path 2021-07-14 17:02:22 +02:00
Enrico Ottonello 66604bb2b4 added absolute path to process folder 2021-07-14 16:44:51 +02:00
Enrico Ottonello 7840cc6526 merged with master 2021-07-14 15:33:59 +02:00
Enrico Ottonello a65667d217 added publication to dataset even if no contributors 2021-07-14 15:07:07 +02:00
Sandro La Bruzzo 10068c00ea Code refactor:
- removed old workflows in doiboost
 - split the doiboost workflow into preprocess and process
2021-07-14 14:45:50 +02:00
Miriam Baglioni 1cdd09cd8e Tentative fix for testing of Jenkins 2021-07-14 11:14:59 +02:00
Sandro La Bruzzo 4cb65bc64a fixed process doiboost workflow:
- split OrcidToOAF into two phases: preprocess and process
- updated workflow used in production
2021-07-14 09:44:32 +02:00
Claudio Atzori 734de62474 [doiboost] added workflow for the ActionSet update dedicated to production 2021-07-13 17:26:04 +02:00
Claudio Atzori fa720c1da4 [doiboost] added workflow for the ActionSet update dedicated to production 2021-07-13 16:59:30 +02:00
Claudio Atzori 9629569e22 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2021-07-13 16:04:08 +02:00
Claudio Atzori f13e11e3f7 [aggregation] datacite wf: defined parameter declaring the path used to store the OAF objects produced by the transformation phase 2021-07-13 16:04:02 +02:00
Miriam Baglioni f5486ffb14 Fixed issues in tests 2021-07-13 14:07:45 +02:00
Claudio Atzori e0061232e9 [aggregation] datacite wf: conditional creation of links, optional resume from intermediate phases 2021-07-13 13:41:21 +02:00
Claudio Atzori 28a66af425 updated URL in the issueManagement tag 2021-07-13 11:52:24 +02:00
Claudio Atzori 783988af06 depending on dhp-schemas:2.6.14 (release) 2021-07-13 11:17:25 +02:00
Claudio Atzori 9038fdc771 depending on dhp-schemas:2.7.14 (release) 2021-07-12 17:46:12 +02:00
Sandro La Bruzzo bbe8193930 merged stable ids 2021-07-12 17:00:43 +02:00
Sandro La Bruzzo 57c74c73c6 fixed mistakes in oozie workflow 2021-07-09 12:28:09 +02:00
Sandro La Bruzzo 61ccb54fde removed wrong loop on oozie wf 2021-07-09 12:17:57 +02:00
Sandro La Bruzzo 9f5a0f3ab6 moved wf indexing of Scholexplorer in dhp-graph-provision 2021-07-09 12:06:43 +02:00
Sandro La Bruzzo 09fccf8000 added workflow to serialize scholix and summary in json 2021-07-09 11:01:42 +02:00
Sandro La Bruzzo 0ea576745f updated CreateInputGraph because generics don't work on Spark Dataset 2021-07-09 10:29:24 +02:00
Sandro La Bruzzo cd17e19044 implemented branch workflow to import datacite and crossref in scholexplorer 2021-07-08 21:20:19 +02:00
Sandro La Bruzzo 8a034e46e1 updated baseline workflow 2021-07-08 11:11:41 +02:00
Sandro La Bruzzo 0799ac9fb6 fixed wrong path 2021-07-08 10:36:37 +02:00
Sandro La Bruzzo 4d53402712 extended ebiLinks to create a dataset before generation of OAF 2021-07-08 10:26:21 +02:00
Sandro La Bruzzo a4a54a3786 code refactor 2021-07-08 09:08:25 +02:00
Sandro La Bruzzo a01dbe0ab0 completed workflow of generation of scholix and summaries 2021-07-07 23:10:34 +02:00
Sandro La Bruzzo ed684874f2 deleted old scholix project 2021-07-06 17:20:08 +02:00
Sandro La Bruzzo 8535506c22 added scholix generation 2021-07-06 17:18:06 +02:00
Sandro La Bruzzo 4c54bd8742 add test to verify merge scholix on source 2021-07-06 11:32:14 +02:00
Sandro La Bruzzo 7d8db2eb8a betterRenamingMethod 2021-07-06 09:56:32 +02:00
Sandro La Bruzzo c952c8d236 generate first side of scholix mapping 2021-07-06 09:53:14 +02:00
Sandro La Bruzzo e4b84ef5d6 fixed mapping OAF to Scholix summary 2021-07-02 16:48:48 +02:00
Sandro La Bruzzo 8fa0841898 Merge remote-tracking branch 'origin/stable_ids' into stable_id_scholexplorer 2021-07-01 22:14:04 +02:00
Sandro La Bruzzo c6fa8598e1 massive code refactor:
removed modules dhp-*-scholexplorer
2021-07-01 22:13:45 +02:00
Antonis Lempesis 829caee4fd added the missing indicators files 2021-06-30 17:31:33 +02:00
Sandro La Bruzzo 84b834c893 added test dataset for pangaea 2021-06-30 17:31:09 +02:00
Sandro La Bruzzo 1a6b398968 implemented Creation of Raw Graph and Resolution 2021-06-30 17:27:55 +02:00
Sandro La Bruzzo 623a0c4edb code Refactor, renaming packages 2021-06-30 11:09:30 +02:00
Sandro La Bruzzo db933ebd21 Merge remote-tracking branch 'origin/stable_ids' into stable_id_scholexplorer 2021-06-29 14:16:12 +02:00
Sandro La Bruzzo 7e08655e5f added relation dates in all scholexplorer Datasources 2021-06-29 12:02:03 +02:00
Sandro La Bruzzo 075055eaca added relation dates in bio mapping 2021-06-29 10:33:09 +02:00
Sandro La Bruzzo f36f92287d implemented mapping from Crossref Event Data to Oaf 2021-06-29 10:21:23 +02:00
Sandro La Bruzzo 511ec14c63 implemented mapping from EBI and Scholix Resolved to OAF 2021-06-28 22:04:22 +02:00
Sandro La Bruzzo ad50415167 Merge remote-tracking branch 'origin/stable_ids' into stable_id_scholexplorer 2021-06-24 17:20:50 +02:00
Sandro La Bruzzo 80e15cc455 implemented mapping from uniprot, pdb and ebi links 2021-06-24 17:20:00 +02:00
Sandro La Bruzzo 080a280bea added pdb to Oaf Transformation 2021-06-21 16:23:59 +02:00
Sandro La Bruzzo 1dc0c59e20 merged fix thai dates from stable_ids 2021-06-21 10:39:46 +02:00
Sandro La Bruzzo dc66cf615b Merge branch 'stable_id_scholexplorer' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_id_scholexplorer 2021-06-21 09:38:33 +02:00
Sandro La Bruzzo 507e42102a added pdb to oaf class 2021-06-21 09:36:40 +02:00
Sandro La Bruzzo a167543637 Merge branch 'stable_ids' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_id_scholexplorer 2021-06-21 09:14:11 +02:00
Sandro La Bruzzo 4fe7b75644 renamed packages 2021-06-18 16:41:24 +02:00
Sandro La Bruzzo 3990165d05 changed typologies of unresolved relation 2021-06-18 11:43:59 +02:00
Sandro La Bruzzo 3100166d29 Merge remote-tracking branch 'origin/stable_ids' into stable_id_scholexplorer 2021-06-16 16:22:16 +02:00
Sandro La Bruzzo dfcf78cf24 removed wrong code 2021-06-16 14:57:42 +02:00
Sandro La Bruzzo cc0f2b11fb Implemented mapping from pubmed baseline to OAF 2021-06-16 14:56:24 +02:00
Sandro La Bruzzo aeb8132627 Merged branch stable_ids 2021-06-14 10:07:29 +02:00
Sandro La Bruzzo efbea1e01a minor fix 2021-06-14 09:45:14 +02:00
Sandro La Bruzzo 0d1f37302f Merge branch 'stable_ids' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_id_scholexplorer 2021-06-09 09:35:16 +02:00
Sandro La Bruzzo 0cdb7ccdaa added inverse relations to datacite mapping 2021-06-04 15:10:20 +02:00
Sandro La Bruzzo 5b724d9972 added relations to datacite mapping 2021-06-04 10:14:22 +02:00
Enrico Ottonello abdd0ade1f added temporary output folder as workflow parameter 2021-05-21 12:08:16 +02:00
Enrico Ottonello d0945c3c78 added temporary output folder, because folder access rights differ between beta and prod 2021-05-20 19:14:31 +02:00
Enrico Ottonello 1265dadc90 workflow aligned with stable_ids 2021-05-20 19:01:28 +02:00
Enrico Ottonello 0821d8e97d Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi 2021-05-20 18:33:18 +02:00
Enrico Ottonello ae7bd24d79 removed old workflows 2021-05-20 18:32:22 +02:00
Enrico Ottonello 4d6c473bf1 removed redundant classes contained now in dhp-schema 2021-05-20 18:26:42 +02:00
Claudio Atzori ea9b00ce56 adjusted test 2021-05-20 15:31:42 +02:00
Claudio Atzori 2e70aa43f0 Merge pull request 'H2020Classification fix and possibility to add datasources in blacklist for propagation of result to organization' (#108) from miriam.baglioni/dnet-hadoop:master into master
Reviewed-on: D-Net/dnet-hadoop#108

The changes look ok, but please drop a comment to describe how the parameters should be changed from the workflow caller for both workflows
* H2020Classification
* propagation of result to organization
2021-05-20 15:25:05 +02:00
Enrico Ottonello e13926cdd0 merged with master 2021-05-14 18:10:31 +02:00
216 changed files with 8104 additions and 12472 deletions


@ -98,7 +98,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
Result r = (Result) value;
if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
if (Objects.isNull(r.getTitle()) || r.getTitle().isEmpty()) {
return false;
}
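Reading the two conditions above as the removed and the added line respectively, the old check only rejected records whose title list was present but empty, so records with a null title list slipped through; the corrected check rejects both cases. A minimal Scala paraphrase of the corrected predicate (illustration only, reusing the Result bean and java.util.Objects from the Java code above):

// sketch: a record passes the cleaning filter only when it has a non-null, non-empty title list
def hasTitle(r: Result): Boolean =
  Objects.nonNull(r.getTitle) && !r.getTitle.isEmpty
// the cleaning function returns false, i.e. drops the record, when this predicate is false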

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -4,8 +4,6 @@ import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.DHPUtils
@ -19,13 +17,16 @@ import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import java.util.regex.Pattern
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.language.postfixOps
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
@ -44,6 +45,37 @@ case class HostedByMapType(openaire_id: String, datacite_name: String, official_
object DataciteToOAFTransformation {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val subRelTypeMapping: Map[String,(String,String)] = Map(
"References" ->("IsReferencedBy","relationship"),
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
"IsPartOf" ->("HasPart","part"),
"HasPart" ->("IsPartOf","part"),
"IsVersionOf" ->("HasVersion","version"),
"HasVersion" ->("IsVersionOf","version"),
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
"IsContinuedBy" ->("Continues","relationship"),
"Continues" ->("IsContinuedBy","relationship"),
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
"IsDocumentedBy" ->("Documents","relationship"),
"IsSourceOf" ->("IsDerivedFrom","relationship"),
"Cites" ->("IsCitedBy","citation"),
"IsCitedBy" ->("Cites","citation"),
"IsDerivedFrom" ->("IsSourceOf","relationship"),
"IsVariantFormOf" ->("IsDerivedFrom","version"),
"IsReferencedBy" ->("References","relationship"),
"IsObsoletedBy" ->("IsNewVersionOf","version"),
"Reviews" ->("IsReviewedBy","review"),
"Documents" ->("IsDocumentedBy","relationship"),
"IsCompiledBy" ->("Compiles","relationship"),
"Compiles" ->("IsCompiledBy","relationship"),
"IsReviewedBy" ->("Reviews","review")
)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
@ -174,7 +206,6 @@ object DataciteToOAFTransformation {
case _: Throwable => ""
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
@ -243,6 +274,7 @@ object DataciteToOAFTransformation {
* As described in ticket #6377,
* when the result comes from figshare we need to remove the subjects
* and set Access rights to OPEN.
*
* @param r
*/
def fix_figshare(r: Result): Unit = {
@ -259,6 +291,12 @@ object DataciteToOAFTransformation {
}
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
}
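For illustration only (the prefix value and the DOI below are invented, not taken from the diff), the helper above builds target identifiers by padding "<idPrefix>|<pidType>" with underscores to 15 characters and appending "::" plus the MD5 of the lowercased pid:

// hypothetical example: "50" as the entity-type prefix, an invented DOI
val targetId = createDNetTargetIdentifier("10.1234/ABC", "doi", "50")
// "50|doi" is padded to 15 characters, so targetId has the shape
// 50|doi_________::<md5 of "10.1234/abc">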
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
OafMapperUtils.structuredProperty(dt, q, null)
}
@ -297,7 +335,7 @@ object DataciteToOAFTransformation {
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup): List[Oaf] = {
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (filter_json(input))
return List()
@ -414,6 +452,7 @@ object DataciteToOAFTransformation {
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
@ -494,10 +533,23 @@ object DataciteToOAFTransformation {
} yield awardUri
result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
if (result.getId == null)
return List()
val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
if (exportLinks) {
val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(rels,result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
@ -505,6 +557,36 @@ object DataciteToOAFTransformation {
List(result)
}
private def generateRelations(rels: List[RelatedIdentifierType], id:String, date:String):List[Relation] = {
rels
.filter(r =>
subRelTypeMapping.contains(r.relationType) && (
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType)._2
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType)
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
rel.setTarget(s"unresolved::${r.relatedIdentifier}::${r.relatedIdentifierType}")
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue)(collection.breakOut)
rel
})(collection breakOut)
}
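As a minimal sketch of the convention implemented above (the identifiers below are invented), a related identifier coming from the Datacite JSON is turned into a Relation whose target is an "unresolved" placeholder, left for a later resolution step to replace with an actual graph identifier:

// hypothetical entry parsed from the relatedIdentifiers array
val related = RelatedIdentifierType("Cites", "10.5555/example", "doi")
// subRelTypeMapping("Cites") is ("IsCitedBy", "citation"), so generateRelations emits a Relation with
//   relType    = "resultResult"
//   subRelType = "citation"
//   relClass   = "Cites"
//   target     = "unresolved::10.5555/example::doi"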
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)


@ -0,0 +1,46 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object FilterCrossrefEntitiesSpark {
val log: Logger = LoggerFactory.getLogger(getClass.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]
val d:Dataset[Oaf]= spark.read.load(sourcePath).as[Oaf]
d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath)
}
}


@ -22,6 +22,7 @@ object GenerateDataciteDatasetSpark {
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
@ -40,7 +41,7 @@ object GenerateDataciteDatasetSpark {
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies))
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.filter(d => d != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}


@ -1,20 +1,21 @@
[
{
"paramName": "n",
"paramLongName": "nameNode",
"paramDescription": "the Name Node",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source path",
"paramDescription": "the source mdstore path",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target path",
"paramDescription": "the target mdstore path",
"paramRequired": true
},
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "the master name",
"paramRequired": true
}
]


@ -23,5 +23,12 @@
"paramLongName": "isLookupUrl",
"paramDescription": "the isLookup URL",
"paramRequired": true
},
{
"paramName": "l",
"paramLongName": "exportLinks",
"paramDescription": "should export also links",
"paramRequired": false
}
]


@ -4,6 +4,10 @@
<name>mainPath</name>
<description>the working path of Datacite stores</description>
</property>
<property>
<name>oafTargetPath</name>
<description>the target path where the OAF records are stored</description>
</property>
<property>
<name>isLookupUrl</name>
<description>The IS lookUp service endpoint</description>
@ -13,15 +17,26 @@
<value>100</value>
<description>The request block size</description>
</property>
<property>
<name>exportLinks</name>
<value>false</value>
<description>instructs the transformation phase whether or not to produce the links</description>
</property>
</parameters>
<start to="ImportDatacite"/>
<start to="resume_from"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="resume_from">
<switch>
<case to="TransformDatacite">${wf:conf('resumeFrom') eq 'TransformDatacite'}</case>
<default to="ImportDatacite"/>
</switch>
</decision>
<action name="ImportDatacite">
<spark xmlns="uri:oozie:spark-action:0.2">
@ -45,12 +60,11 @@
<arg>--master</arg><arg>yarn-cluster</arg>
<arg>--blocksize</arg><arg>${blocksize}</arg>
</spark>
<ok to="TransformJob"/>
<ok to="TransformDatacite"/>
<error to="Kill"/>
</action>
<action name="TransformJob">
<action name="TransformDatacite">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
@ -68,8 +82,9 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--targetPath</arg><arg>${mainPath}/datacite_oaf</arg>
<arg>--targetPath</arg><arg>${oafTargetPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--exportLinks</arg><arg>${exportLinks}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>


@ -1,4 +1,12 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
@ -7,6 +15,7 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>


@ -0,0 +1,84 @@
<workflow-app name="Generate_Datacite_and_Crossref_dump_for_Scholexplorer" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>datacitePath</name>
<description>the path of Datacite spark dataset</description>
</property>
<property>
<name>isLookupUrl</name>
<description>The IS lookUp service endpoint</description>
</property>
<property>
<name>crossrefPath</name>
<description>the path of Crossref spark dataset</description>
</property>
<property>
<name>targetPath</name>
<description>the output path where the generated OAF datasets are stored</description>
</property>
</parameters>
<start to="ImportDatacite"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportDatacite">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ImportDatacite</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${datacitePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}/datacite_oaf</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--exportLinks</arg><arg>true</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="FilterCrossrefEntities"/>
<error to="Kill"/>
</action>
<action name="FilterCrossrefEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>FilterCrossrefEntities</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.FilterCrossrefEntitiesSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${crossrefPath}</arg>
<arg>--targetPath</arg><arg>${targetPath}/crossref_oaf</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>


@ -1,12 +1,15 @@
package eu.dnetlib.dhp.actionmanager.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.SerializationFeature
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.codehaus.jackson.map.ObjectMapper
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
@ -25,9 +28,15 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
val mapper = new ObjectMapper()
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies )
println (mapper.defaultPrettyPrintingWriter().writeValueAsString(res.head))
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
res.foreach(r => {
println (mapper.writeValueAsString(r))
println("----------------------------")
})
}


@ -6,13 +6,13 @@ import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.mockito.Mock;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.transformation.TransformationFactory;
import eu.dnetlib.dhp.transformation.TransformationJobTest;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -34,16 +34,22 @@ public abstract class AbstractVocabularyTest {
private static List<String> vocs() throws IOException {
return IOUtils
.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt"));
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")));
}
private static List<String> synonyms() throws IOException {
return IOUtils
.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt"));
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")));
}
protected void mockupTrasformationRule(final String trule, final String path) throws Exception {
final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path));
final String trValue = IOUtils.toString(Objects.requireNonNull(this.getClass().getResourceAsStream(path)));
lenient()
.when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule)))


@ -128,6 +128,7 @@ dnet:publication_resource @=@ 0002 @=@ scientific book
dnet:publication_resource @=@ 0002 @=@ Монография
dnet:publication_resource @=@ 0002 @=@ Учебник
dnet:publication_resource @=@ 0037 @=@ clinicalTrial
dnet:publication_resource @=@ 0037 @=@ Clinical Trial
dnet:publication_resource @=@ 0037 @=@ http://purl.org/coar/resource_type/c_cb28
dnet:publication_resource @=@ 0022 @=@ collection
dnet:publication_resource @=@ 0004 @=@ A4 Artikkeli konferenssijulkaisussa


@ -1,82 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-scholexplorer</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
</dependencies>
</project>


@ -1,121 +0,0 @@
package eu.dnetlib.dedup;
import static java.util.Collections.reverseOrder;
import static java.util.Map.Entry.comparingByValue;
import static java.util.stream.Collectors.toMap;
import static org.apache.commons.lang.StringUtils.endsWith;
import static org.apache.commons.lang.StringUtils.substringBefore;
import java.time.Year;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import eu.dnetlib.dhp.schema.oaf.Field;
public class DatePicker {
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
private static final String DATE_DEFAULT_SUFFIX = "01-01";
private static final int YEAR_LB = 1300;
private static final int YEAR_UB = Year.now().getValue() + 5;
public static Field<String> pick(final Collection<String> dateofacceptance) {
final Map<String, Integer> frequencies = dateofacceptance
.parallelStream()
.filter(StringUtils::isNotBlank)
.collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
if (frequencies.isEmpty()) {
return new Field<>();
}
final Field<String> date = new Field<>();
date.setValue(frequencies.keySet().iterator().next());
// let's sort this map by values first, filtering out invalid dates
final Map<String, Integer> sorted = frequencies
.entrySet()
.stream()
.filter(d -> StringUtils.isNotBlank(d.getKey()))
.filter(d -> d.getKey().matches(DATE_PATTERN))
.filter(d -> inRange(d.getKey()))
.sorted(reverseOrder(comparingByValue()))
.collect(
toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
// shortcut
if (sorted.size() == 0) {
return date;
}
// voting method (1/3 + 1) wins
if (sorted.size() >= 3) {
final int acceptThreshold = (sorted.size() / 3) + 1;
final List<String> accepted = sorted
.entrySet()
.stream()
.filter(e -> e.getValue() >= acceptThreshold)
.map(e -> e.getKey())
.collect(Collectors.toList());
// cannot find strong majority
if (accepted.isEmpty()) {
final int max = sorted.values().iterator().next();
Optional<String> first = sorted
.entrySet()
.stream()
.filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
.map(Map.Entry::getKey)
.findFirst();
if (first.isPresent()) {
date.setValue(first.get());
return date;
}
date.setValue(sorted.keySet().iterator().next());
return date;
}
if (accepted.size() == 1) {
date.setValue(accepted.get(0));
return date;
} else {
final Optional<String> first = accepted
.stream()
.filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
.findFirst();
if (first.isPresent()) {
date.setValue(first.get());
return date;
}
return date;
}
// 1st non YYYY-01-01 is returned
} else {
if (sorted.size() == 2) {
for (Map.Entry<String, Integer> e : sorted.entrySet()) {
if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
date.setValue(e.getKey());
return date;
}
}
}
// none of the dates seems good enough, return the 1st one
date.setValue(sorted.keySet().iterator().next());
return date;
}
}
private static boolean inRange(final String date) {
final int year = Integer.parseInt(substringBefore(date, "-"));
return year >= YEAR_LB && year <= YEAR_UB;
}
}


@ -1,327 +0,0 @@
package eu.dnetlib.dedup;
import java.util.Collection;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
public class DedupRecordFactory {
public static JavaRDD<OafEntity> createDedupRecord(
final JavaSparkContext sc,
final SparkSession spark,
final String mergeRelsInputPath,
final String entitiesInputPath,
final OafEntityType entityType,
final DedupConfig dedupConf) {
long ts = System.currentTimeMillis();
// <id, json_entity>
final JavaPairRDD<String, String> inputJsonEntities = spark
.read()
.load(entitiesInputPath)
.as(Encoders.kryo(Oaf.class))
.map(
(MapFunction<Oaf, String>) p -> new org.codehaus.jackson.map.ObjectMapper().writeValueAsString(p),
Encoders.STRING())
.javaRDD()
.mapToPair(
(PairFunction<String, String, String>) it -> new Tuple2<>(
MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it));
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, String> mergeRels = spark
.read()
.load(mergeRelsInputPath)
.as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.mapToPair(
(PairFunction<Relation, String, String>) r -> new Tuple2<String, String>(r.getTarget(), r.getSource()));
// <dedup_id, json_entity_merged>
final JavaPairRDD<String, String> joinResult = mergeRels
.join(inputJsonEntities)
.mapToPair(
(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();
switch (entityType) {
case publication:
return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
case dataset:
return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
case project:
return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
case software:
return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
case datasource:
return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
case organization:
return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
case otherresearchproduct:
return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
default:
return null;
}
}
private static DLIPublication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
DLIPublication p = new DLIPublication(); // the result of the merge, to be returned at the end
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e
._2()
.forEach(
pub -> {
try {
DLIPublication publication = mapper.readValue(pub, DLIPublication.class);
p.mergeFrom(publication);
p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
// add to the list if they are not null
if (publication.getDateofacceptance() != null)
dateofacceptance.add(publication.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
p.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (p.getDataInfo() == null)
p.setDataInfo(new DataInfo());
p.getDataInfo().setTrust("0.9");
p.setLastupdatetimestamp(ts);
return p;
}
private static DLIDataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
DLIDataset d = new DLIDataset(); // the result of the merge, to be returned at the end
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e
._2()
.forEach(
dat -> {
try {
Dataset dataset = mapper.readValue(dat, Dataset.class);
d.mergeFrom(dataset);
d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
// add to the list if they are not null
if (dataset.getDateofacceptance() != null)
dateofacceptance.add(dataset.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
d.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (d.getDataInfo() == null)
d.setDataInfo(new DataInfo());
d.getDataInfo().setTrust("0.9");
d.setLastupdatetimestamp(ts);
return d;
}
private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Project p = new Project(); // the result of the merge, to be returned at the end
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
if (e._2() != null)
e
._2()
.forEach(
proj -> {
try {
Project project = mapper.readValue(proj, Project.class);
p.mergeFrom(project);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (p.getDataInfo() == null)
p.setDataInfo(new DataInfo());
p.getDataInfo().setTrust("0.9");
p.setLastupdatetimestamp(ts);
return p;
}
private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Software s = new Software(); // the result of the merge, to be returned at the end
s.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e
._2()
.forEach(
soft -> {
try {
Software software = mapper.readValue(soft, Software.class);
s.mergeFrom(software);
s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor()));
// add to the list if they are not null
if (software.getDateofacceptance() != null)
dateofacceptance.add(software.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
s.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (s.getDataInfo() == null)
s.setDataInfo(new DataInfo());
s.getDataInfo().setTrust("0.9");
s.setLastupdatetimestamp(ts);
return s;
}
private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Datasource d = new Datasource(); // the result of the merge, to be returned at the end
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
if (e._2() != null)
e
._2()
.forEach(
dat -> {
try {
Datasource datasource = mapper.readValue(dat, Datasource.class);
d.mergeFrom(datasource);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (d.getDataInfo() == null)
d.setDataInfo(new DataInfo());
d.getDataInfo().setTrust("0.9");
d.setLastupdatetimestamp(ts);
return d;
}
private static Organization organizationMerger(
Tuple2<String, Iterable<String>> e, final long ts) {
Organization o = new Organization(); // the result of the merge, to be returned at the end
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
StringBuilder trust = new StringBuilder("0.0");
if (e._2() != null)
e
._2()
.forEach(
pub -> {
try {
Organization organization = mapper.readValue(pub, Organization.class);
final String currentTrust = organization.getDataInfo().getTrust();
if (!"1.0".equals(currentTrust)) {
trust.setLength(0);
trust.append(currentTrust);
}
o.mergeFrom(organization);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (o.getDataInfo() == null) {
o.setDataInfo(new DataInfo());
}
if (o.getDataInfo() == null)
o.setDataInfo(new DataInfo());
o.getDataInfo().setTrust("0.9");
o.setLastupdatetimestamp(ts);
return o;
}
private static OtherResearchProduct otherresearchproductMerger(
Tuple2<String, Iterable<String>> e, final long ts) {
OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge, to be
// returned at the end
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e
._2()
.forEach(
orp -> {
try {
OtherResearchProduct otherResearchProduct = mapper
.readValue(orp, OtherResearchProduct.class);
o.mergeFrom(otherResearchProduct);
o
.setAuthor(
DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor()));
// add to the list if they are not null
if (otherResearchProduct.getDateofacceptance() != null)
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (o.getDataInfo() == null)
o.setDataInfo(new DataInfo());
o.setDateofacceptance(DatePicker.pick(dateofacceptance));
o.getDataInfo().setTrust("0.9");
o.setLastupdatetimestamp(ts);
return o;
}
}


@ -1,239 +0,0 @@
package eu.dnetlib.dedup;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import scala.Tuple2;
public class DedupUtility {
private static final Double THRESHOLD = 0.95;
public static Map<String, LongAccumulator> constructAccumulator(
final DedupConfig dedupConf, final SparkContext context) {
Map<String, LongAccumulator> accumulators = new HashMap<>();
String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
accumulators.put(acc1, context.longAccumulator(acc1));
String acc2 = String
.format(
"%s::%s",
dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
accumulators.put(acc2, context.longAccumulator(acc2));
String acc3 = String
.format(
"%s::%s",
dedupConf.getWf().getEntityType(),
String
.format(
"Skipped records for count(%s) >= %s",
dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
accumulators.put(acc3, context.longAccumulator(acc3));
String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
accumulators.put(acc4, context.longAccumulator(acc4));
String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
accumulators.put(acc5, context.longAccumulator(acc5));
String acc6 = String
.format(
"%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
accumulators.put(acc6, context.longAccumulator(acc6));
return accumulators;
}
public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
return context.textFile(path);
}
public static void deleteIfExists(String path) throws IOException {
Configuration conf = new Configuration();
FileSystem fileSystem = FileSystem.get(conf);
if (fileSystem.exists(new Path(path))) {
fileSystem.delete(new Path(path), true);
}
}
public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
Configuration conf = new Configuration();
FileSystem fileSystem = FileSystem.get(conf);
FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path)));
return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
}
static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
}
}
static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
}
public static String md5(final String s) {
try {
final MessageDigest md = MessageDigest.getInstance("MD5");
md.update(s.getBytes(StandardCharsets.UTF_8));
return new String(Hex.encodeHex(md.digest()));
} catch (final Exception e) {
System.err.println("Error creating id");
return null;
}
}
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b);
List<Author> base, enrich;
int sa = authorsSize(a);
int sb = authorsSize(b);
if (pa == pb) {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
} else {
base = pa > pb ? a : b;
enrich = pa > pb ? b : a;
}
enrichPidFromList(base, enrich);
return base;
}
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
if (base == null || enrich == null)
return;
final Map<String, Author> basePidAuthorMap = base
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap(
a -> a
.getPid()
.stream()
.filter(p -> !basePidAuthorMap.containsKey(p.toComparableString()))
.map(p -> new Tuple2<>(p, a)))
.collect(Collectors.toList());
pidToEnrich
.forEach(
a -> {
Optional<Tuple2<Double, Author>> simAuhtor = base
.stream()
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
.max(Comparator.comparing(Tuple2::_1));
if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
Author r = simAuhtor.get()._2();
r.getPid().add(a._1());
}
});
}
public static String createEntityPath(final String basePath, final String entityType) {
return String.format("%s/%s", basePath, entityType);
}
public static String createSimRelPath(final String basePath, final String entityType) {
return String.format("%s/%s/simRel", basePath, entityType);
}
public static String createMergeRelPath(final String basePath, final String entityType) {
return String.format("%s/%s/mergeRel", basePath, entityType);
}
private static Double sim(Author a, Author b) {
final Person pa = parse(a);
final Person pb = parse(b);
if (pa.isAccurate() & pb.isAccurate()) {
return new JaroWinkler()
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
} else {
return new JaroWinkler()
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
}
}
private static String normalize(final String s) {
return nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
// of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
private static Person parse(Author author) {
if (StringUtils.isNotBlank(author.getSurname())) {
return new Person(author.getSurname() + ", " + author.getName(), false);
} else {
return new Person(author.getFullname(), false);
}
}
private static int countAuthorsPids(List<Author> authors) {
if (authors == null)
return 0;
return (int) authors.stream().filter(DedupUtility::hasPid).count();
}
private static int authorsSize(List<Author> authors) {
if (authors == null)
return 0;
return authors.size();
}
private static boolean hasPid(Author a) {
if (a == null || a.getPid() == null || a.getPid().size() == 0)
return false;
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
}
}


@ -1,182 +0,0 @@
package eu.dnetlib.dedup;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Serializable;
import scala.Tuple2;
public class Deduper implements Serializable {
private static final Log log = LogFactory.getLog(Deduper.class);
/**
* @return the list of relations generated by the deduplication
* @param: the spark context
* @param: list of JSON entities to be deduped
* @param: the dedup configuration
*/
public static JavaPairRDD<String, String> dedup(
JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
// create vertexes of the graph: <ID, MapDocument>
JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
// create blocks for deduplication
JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
// create relations by comparing only elements in the same group
return computeRelations(context, blocks, config);
// final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new
// Edge<>(it._1().hashCode(),
// it._2().hashCode(), "equalTo")).rdd();
//
// RDD<Tuple2<Object, MapDocument>> vertexes =
// mapDocs.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t ->
// new
// Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2())).rdd();
// accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
//
// return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
}
/**
* @return the list of relations generated by the deduplication
* @param: the spark context
* @param: list of blocks
* @param: the dedup configuration
*/
public static JavaPairRDD<String, String> computeRelations(
JavaSparkContext context,
JavaPairRDD<String, Iterable<MapDocument>> blocks,
DedupConfig config) {
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
return blocks
.flatMapToPair(
(PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
final SparkReporter reporter = new SparkReporter(accumulators);
new BlockProcessor(config).process(it._1(), it._2(), reporter);
return reporter.getRelations().iterator();
})
.mapToPair(
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item -> new Tuple2<String, Tuple2<String, String>>(
item._1() + item._2(), item))
.reduceByKey((a, b) -> a)
.mapToPair(
(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
}
/**
* @return the list of blocks based on clustering of dedup configuration
* @param: the spark context
* @param: list of entities: <id, entity>
* @param: the dedup configuration
*/
public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(
JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
return mapDocs
// the reduce is just to be sure that we haven't document with same id
.reduceByKey((a, b) -> a)
.map(Tuple2::_2)
// Clustering: from <id, doc> to List<groupkey,doc>
.flatMapToPair(
(PairFlatMapFunction<MapDocument, String, MapDocument>) a -> DedupUtility
.getGroupingKeys(config, a)
.stream()
.map(it -> new Tuple2<>(it, a))
.collect(Collectors.toList())
.iterator())
.groupByKey();
}
public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(
JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
final String of = config.getWf().getOrderField();
final int maxQueueSize = config.getWf().getGroupMaxSize();
return mapDocs
// the reduce is just to be sure that we haven't document with same id
.reduceByKey((a, b) -> a)
.map(Tuple2::_2)
// Clustering: from <id, doc> to List<groupkey,doc>
.flatMapToPair(
(PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a -> DedupUtility
.getGroupingKeys(config, a)
.stream()
.map(
it -> {
List<MapDocument> tmp = new ArrayList<>();
tmp.add(a);
return new Tuple2<>(it, tmp);
})
.collect(Collectors.toList())
.iterator())
.reduceByKey(
(Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
v1.addAll(v2);
v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
if (v1.size() > maxQueueSize)
return new ArrayList<>(v1.subList(0, maxQueueSize));
return v1;
});
}
/**
* @return the list of vertexes: <id, mapDocument>
* @param: the spark context
* @param: list of JSON entities
* @param: the dedup configuration
*/
public static JavaPairRDD<String, MapDocument> mapToVertexes(
JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
return entities
.mapToPair(
(PairFunction<String, String, MapDocument>) s -> {
MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);
});
}
public static JavaPairRDD<String, String> computeRelations2(
JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
return blocks
.flatMapToPair(
(PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
try {
final SparkReporter reporter = new SparkReporter(accumulators);
new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
return reporter.getRelations().iterator();
} catch (Exception e) {
throw new RuntimeException(it._2().get(0).getIdentifier(), e);
}
})
.mapToPair(
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item -> new Tuple2<String, Tuple2<String, String>>(
item._1() + item._2(), item))
.reduceByKey((a, b) -> a)
.mapToPair(
(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
}
}


@ -1,6 +0,0 @@
package eu.dnetlib.dedup;
public enum OafEntityType {
datasource, organization, project, dataset, otherresearchproduct, software, publication
}


@ -1,112 +0,0 @@
package eu.dnetlib.dedup;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.hash.Hashing;
import eu.dnetlib.dedup.graph.ConnectedComponent;
import eu.dnetlib.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
public class SparkCreateConnectedComponent {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCreateConnectedComponent.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkCreateConnectedComponent.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final JavaPairRDD<Object, String> vertexes = spark
.read()
.load(inputPath + "/" + entity)
.as(Encoders.kryo(Oaf.class))
.map((MapFunction<Oaf, String>) p -> new ObjectMapper().writeValueAsString(p), Encoders.STRING())
.javaRDD()
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair(
(PairFunction<String, Object, String>) s -> new Tuple2<Object, String>(getHashcode(s), s));
final Dataset<Relation> similarityRelations = spark
.read()
.load(DedupUtility.createSimRelPath(targetPath, entity))
.as(Encoders.bean(Relation.class));
final RDD<Edge<String>> edgeRdd = similarityRelations
.javaRDD()
.map(
it -> new Edge<>(
getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass()))
.rdd();
final JavaRDD<ConnectedComponent> cc = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
.toJavaRDD();
final Dataset<Relation> mergeRelation = spark
.createDataset(
cc
.filter(k -> k.getDocIds().size() > 1)
.flatMap(
(FlatMapFunction<ConnectedComponent, Relation>) c -> c
.getDocIds()
.stream()
.flatMap(
id -> {
List<Relation> tmp = new ArrayList<>();
Relation r = new Relation();
r.setSource(c.getCcId());
r.setTarget(id);
r.setRelClass(ModelConstants.MERGES);
tmp.add(r);
r = new Relation();
r.setTarget(c.getCcId());
r.setSource(id);
r.setRelClass(ModelConstants.IS_MERGED_IN);
tmp.add(r);
return tmp.stream();
})
.iterator())
.rdd(),
Encoders.bean(Relation.class));
mergeRelation
.write()
.mode("overwrite")
.save(DedupUtility.createMergeRelPath(targetPath, entity));
}
public static long getHashcode(final String id) {
return Hashing.murmur3_128().hashString(id).asLong();
}
}


@ -1,59 +0,0 @@
package eu.dnetlib.dedup;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.pace.config.DedupConfig;
public class SparkCreateDedupRecord {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCreateDedupRecord.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkCreateDedupRecord.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String sourcePath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String dedupPath = parser.get("dedupPath");
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory
.createDedupRecord(
sc,
spark,
DedupUtility.createMergeRelPath(dedupPath, entity),
DedupUtility.createEntityPath(sourcePath, entity),
OafEntityType.valueOf(entity),
dedupConf);
spark
.createDataset(dedupRecord.rdd(), Encoders.kryo(OafEntity.class))
.write()
.mode(SaveMode.Overwrite)
.save(dedupPath + "/" + entity + "/dedup_records");
//
//
// dedupRecord
// .map(
// r -> {
// ObjectMapper mapper = new ObjectMapper();
// return mapper.writeValueAsString(r);
// })
// .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records");
}
}


@ -1,92 +0,0 @@
package eu.dnetlib.dedup;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
/**
 * This Spark job creates similarity relations between entities and saves the result.
 * <p>
 * Expected parameters: sourcePath, entity, targetPath
 */
public class SparkCreateSimRels {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCreateSimRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkCreateSimRels.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
// final DedupConfig dedupConf =
// DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
JavaPairRDD<String, MapDocument> mapDocument = spark
.read()
.load(inputPath + "/" + entity)
.as(Encoders.kryo(Oaf.class))
.map((MapFunction<Oaf, String>) p -> new ObjectMapper().writeValueAsString(p), Encoders.STRING())
.javaRDD()
.repartition(1000)
.mapToPair(
s -> {
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
return new Tuple2<>(d.getIdentifier(), d);
});
// create blocks for deduplication
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
// JavaPairRDD<String, Iterable<MapDocument>> blocks = Deduper.createBlocks(sc,
// mapDocument, dedupConf);
// create relations by comparing only elements in the same group
final JavaPairRDD<String, String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
// final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations(sc, blocks,
// dedupConf);
final JavaRDD<Relation> isSimilarToRDD = dedupRels
.map(
simRel -> {
final Relation r = new Relation();
r.setSource(simRel._1());
r.setTarget(simRel._2());
r.setRelClass("isSimilarTo");
return r;
});
spark
.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class))
.write()
.mode("overwrite")
.save(DedupUtility.createSimRelPath(targetPath, entity));
}
}


@ -1,52 +0,0 @@
package eu.dnetlib.dedup;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.pace.util.Reporter;
import scala.Serializable;
import scala.Tuple2;
public class SparkReporter implements Serializable, Reporter {
final List<Tuple2<String, String>> relations = new ArrayList<>();
private static final Log log = LogFactory.getLog(SparkReporter.class);
Map<String, LongAccumulator> accumulators;
public SparkReporter(Map<String, LongAccumulator> accumulators) {
this.accumulators = accumulators;
}
public void incrementCounter(
String counterGroup,
String counterName,
long delta,
Map<String, LongAccumulator> accumulators) {
final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
if (accumulators.containsKey(accumulatorName)) {
accumulators.get(accumulatorName).add(delta);
}
}
@Override
public void incrementCounter(String counterGroup, String counterName, long delta) {
incrementCounter(counterGroup, counterName, delta, accumulators);
}
@Override
public void emit(String type, String from, String to) {
relations.add(new Tuple2<>(from, to));
}
public List<Tuple2<String, String>> getRelations() {
return relations;
}
}
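
The reporter above keeps emitted pairs in a local list and forwards counter increments to Spark accumulators keyed as "group::name", which is how computeRelations2 surfaces per-block statistics. A small sketch of its standalone use; the accumulator map would normally come from DedupUtility.constructAccumulator, and the counter names here are made up:

package eu.dnetlib.dedup;

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

public class SparkReporterSketch {

	public static void main(String[] args) {
		// a local context only for the sketch; the jobs above wrap spark.sparkContext() instead
		final JavaSparkContext sc = new JavaSparkContext("local[*]", "reporter-sketch");
		final Map<String, LongAccumulator> accumulators = new HashMap<>();
		accumulators.put("dedup::comparisons", sc.sc().longAccumulator("dedup::comparisons"));

		final SparkReporter reporter = new SparkReporter(accumulators);
		reporter.incrementCounter("dedup", "comparisons", 1); // lands in the accumulator registered above
		reporter.emit("isSimilarTo", "id1", "id2"); // collected locally, returned by getRelations()
		System.out.println(reporter.getRelations());
		sc.close();
	}
}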


@ -1,84 +0,0 @@
package eu.dnetlib.dedup.graph;
import java.io.IOException;
import java.io.Serializable;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException;
public class ConnectedComponent implements Serializable {
private Set<String> docIds;
private String ccId;
public ConnectedComponent() {
}
public ConnectedComponent(Set<String> docIds) {
this.docIds = docIds;
createID();
}
public String createID() {
if (docIds.size() > 1) {
final String s = getMin();
String prefix = s.split("\\|")[0];
ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s);
return ccId;
} else {
return docIds.iterator().next();
}
}
@JsonIgnore
public String getMin() {
final StringBuilder min = new StringBuilder();
docIds
.forEach(
i -> {
if (StringUtils.isBlank(min.toString())) {
min.append(i);
} else {
if (min.toString().compareTo(i) > 0) {
min.setLength(0);
min.append(i);
}
}
});
return min.toString();
}
@Override
public String toString() {
ObjectMapper mapper = new ObjectMapper();
try {
return mapper.writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Failed to create Json: ", e);
}
}
public Set<String> getDocIds() {
return docIds;
}
public void setDocIds(Set<String> docIds) {
this.docIds = docIds;
}
public String getCcId() {
return ccId;
}
public void setCcId(String ccId) {
this.ccId = ccId;
}
}
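
A quick illustration of createID above: the lexicographically smallest id in the component is selected, its prefix (the part before the first '|') is kept, and the md5 of that minimum id is appended under the dedup_wf_001 namespace; singleton components keep their only id as-is. A sketch with made-up identifiers:

import java.util.LinkedHashSet;
import java.util.Set;

import eu.dnetlib.dedup.graph.ConnectedComponent;

public class ConnectedComponentSketch {

	public static void main(String[] args) {
		// hypothetical document ids following the usual "prefix|namespace::hash" scheme
		final Set<String> docIds = new LinkedHashSet<>();
		docIds.add("50|doi_________::aaaa");
		docIds.add("50|doi_________::bbbb");

		final ConnectedComponent cc = new ConnectedComponent(docIds);
		// prints "50|dedup_wf_001::" followed by the md5 of "50|doi_________::aaaa"
		System.out.println(cc.getCcId());
	}
}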


@ -1,37 +0,0 @@
package eu.dnetlib.dedup.graph
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import scala.collection.JavaConversions;
object GraphProcessor {
def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
val cc = graph.connectedComponents(maxIterations).vertices
val joinResult = vertexes.leftOuterJoin(cc).map {
case (id, (openaireId, cc)) => {
if (cc.isEmpty) {
(id, openaireId)
}
else {
(cc.get, openaireId)
}
}
}
val connectedComponents = joinResult.groupByKey()
.map[ConnectedComponent](cc => asConnectedComponent(cc))
connectedComponents
}
def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
val docs = group._2.toSet[String]
val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
connectedComponent
}
}


@ -1,78 +0,0 @@
package eu.dnetlib.dedup.sx;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils;
import scala.Tuple2;
public class SparkPropagateRelationsJob {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkPropagateRelationsJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkUpdateEntityJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final String relationPath = parser.get("relationPath");
final String mergeRelPath = parser.get("mergeRelPath");
final String targetRelPath = parser.get("targetRelPath");
final Dataset<Relation> merge = spark
.read()
.load(mergeRelPath)
.as(Encoders.bean(Relation.class))
.where("relClass == 'merges'");
final Dataset<Relation> rels = spark
.read()
.load(relationPath)
.as(Encoders.kryo(Relation.class))
.map(
(MapFunction<Relation, Relation>) r -> r,
Encoders.bean(Relation.class));
final Dataset<Relation> firstJoin = rels
.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer")
.map(
(MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
final Relation mergeRelation = r._2();
final Relation relation = r._1();
if (mergeRelation != null)
relation.setSource(mergeRelation.getSource());
if (relation.getDataInfo() == null)
relation.setDataInfo(OafUtils.generateDataInfo("0.9", false));
return relation;
},
Encoders.bean(Relation.class));
final Dataset<Relation> secondJoin = firstJoin
.joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer")
.map(
(MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
final Relation mergeRelation = r._2();
final Relation relation = r._1();
if (mergeRelation != null)
relation.setTarget(mergeRelation.getSource());
return relation;
},
Encoders.kryo(Relation.class));
secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath);
}
}
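
The two joins above rewrite both endpoints of every relation onto the dedup root: merge relations have the root as source and the merged entity as target, so the first join redirects relation sources and the second redirects relation targets. If root R merges A and B, a relation A -> B therefore ends up as R -> R. A plain-Java illustration of that rewriting, with a simple map standing in for the join against the merge relations:

import java.util.HashMap;
import java.util.Map;

public class RelationRemapSketch {

	// maps a merged id onto its dedup root and leaves unmerged ids untouched,
	// which is what the two left outer joins against the 'merges' relations achieve
	static String toRoot(Map<String, String> mergedToRoot, String id) {
		return mergedToRoot.getOrDefault(id, id);
	}

	public static void main(String[] args) {
		final Map<String, String> mergedToRoot = new HashMap<>();
		mergedToRoot.put("A", "R"); // R merges A
		mergedToRoot.put("B", "R"); // R merges B

		final String source = toRoot(mergedToRoot, "A"); // first join rewrites the source
		final String target = toRoot(mergedToRoot, "B"); // second join rewrites the target
		System.out.println(source + " -> " + target); // R -> R
	}
}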


@ -1,102 +0,0 @@
package eu.dnetlib.dedup.sx;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class SparkUpdateEntityJob {
static final String IDJSONPATH = "$.id";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkUpdateEntityJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkUpdateEntityJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String entityPath = parser.get("entityPath");
final String mergeRelPath = parser.get("mergeRelPath");
final String dedupRecordPath = parser.get("dedupRecordPath");
final String entity = parser.get("entity");
final String destination = parser.get("targetPath");
final Dataset<Relation> df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
final JavaPairRDD<String, String> mergedIds = df
.where("relClass == 'merges'")
.select(df.col("target"))
.distinct()
.toJavaRDD()
.mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));
final JavaRDD<String> sourceEntity = sc.textFile(entityPath);
final JavaRDD<String> dedupEntity = sc.textFile(dedupRecordPath);
JavaPairRDD<String, String> entitiesWithId = sourceEntity
.mapToPair(
(PairFunction<String, String, String>) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s));
Class<? extends Oaf> mainClass;
switch (entity) {
case "publication":
mainClass = DLIPublication.class;
break;
case "dataset":
mainClass = DLIDataset.class;
break;
case "unknown":
mainClass = DLIUnknown.class;
break;
default:
throw new IllegalArgumentException("Illegal type " + entity);
}
JavaRDD<String> map = entitiesWithId
.leftOuterJoin(mergedIds)
.map(
k -> k._2()._2().isPresent()
? updateDeletedByInference(k._2()._1(), mainClass)
: k._2()._1());
map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class);
}
private static <T extends Oaf> String updateDeletedByInference(
final String json, final Class<T> clazz) {
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Oaf entity = mapper.readValue(json, clazz);
if (entity.getDataInfo() == null)
entity.setDataInfo(new DataInfo());
entity.getDataInfo().setDeletedbyinference(true);
return mapper.writeValueAsString(entity);
} catch (IOException e) {
throw new RuntimeException("Unable to convert json", e);
}
}
}
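
updateDeletedByInference simply re-serialises each merged entity with dataInfo.deletedbyinference set to true, creating the dataInfo block if it is missing. A self-contained sketch of the same idea on a toy JSON record, using a plain Jackson ObjectNode instead of the DLI* classes so it runs without the schema module:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

public class DeletedByInferenceSketch {

	public static void main(String[] args) throws Exception {
		final ObjectMapper mapper = new ObjectMapper();
		// toy record standing in for a merged DLIPublication/DLIDataset JSON
		final ObjectNode entity = (ObjectNode) mapper.readTree("{\"id\":\"50|xyz::1\",\"dataInfo\":{}}");
		entity.with("dataInfo").put("deletedbyinference", true);
		System.out.println(mapper.writeValueAsString(entity));
		// {"id":"50|xyz::1","dataInfo":{"deletedbyinference":true}}
	}
}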


@ -1,75 +0,0 @@
package eu.dnetlib.dedup.sx
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown, OafUtils}
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.LoggerFactory
import org.apache.spark.sql.functions.col
object SparkUpdateEntityWithDedupInfo {
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityWithDedupInfo.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json")))
val logger = LoggerFactory.getLogger(SparkUpdateEntityWithDedupInfo.getClass)
parser.parseArgument(args)
val workingPath: String = parser.get("workingPath")
logger.info(s"Working dir path = $workingPath")
implicit val oafEncoder: Encoder[OafEntity] = Encoders.kryo[OafEntity]
implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
val spark: SparkSession = SparkSession
.builder()
.appName(SparkUpdateEntityWithDedupInfo.getClass.getSimpleName)
.master(parser.get("master"))
.getOrCreate()
val entityPath = parser.get("entityPath")
val mergeRelPath = parser.get("mergeRelPath")
val dedupRecordPath = parser.get("dedupRecordPath")
val entity = parser.get("entity")
val destination = parser.get("targetPath")
val mergedIds = spark.read.load(mergeRelPath).as[Relation]
.where("relClass == 'merges'")
.select(col("target"))
val entities: Dataset[(String, OafEntity)] = spark
.read
.load(entityPath).as[OafEntity]
.map(o => (o.getId, o))(Encoders.tuple(Encoders.STRING, oafEncoder))
val finalDataset:Dataset[OafEntity] = entities.joinWith(mergedIds, entities("_1").equalTo(mergedIds("target")), "left")
.map(k => {
val e: OafEntity = k._1._2
val t = k._2
if (t != null && t.getString(0).nonEmpty) {
if (e.getDataInfo == null) {
e.setDataInfo(OafUtils.generateDataInfo())
}
e.getDataInfo.setDeletedbyinference(true)
}
e
})
val dedupRecords :Dataset[OafEntity] = spark.read.load(dedupRecordPath).as[OafEntity]
finalDataset.union(dedupRecords)
.repartition(1200).write
.mode(SaveMode.Overwrite).save(destination)
}
}


@ -1,33 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity to be deduped",
"paramRequired": true
},
{
"paramName": "c",
"paramLongName": "dedupConf",
"paramDescription": "dedup configuration to be used",
"compressed": true,
"paramRequired": true
},
{
"paramName": "d",
"paramLongName": "dedupPath",
"paramDescription": "dedup path to load mergeRelation",
"paramRequired": true
}
]


@ -1,38 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "entityPath",
"paramDescription": "the input entity path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "dr",
"paramLongName": "dedupRecordPath",
"paramDescription": "the inputPath of dedup record",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the targetPath",
"paramRequired": true
}
]


@ -1,33 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity to be deduped",
"paramRequired": true
},
{
"paramName": "c",
"paramLongName": "dedupConf",
"paramDescription": "dedup configuration to be used",
"compressed": true,
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "target path to save dedup result",
"paramRequired": true
}
]


@ -1,26 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "relationPath",
"paramDescription": "the input relation path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetRelPath",
"paramDescription": "the output Rel Path",
"paramRequired": true
}
]
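
The four parameter files above map one-to-one onto the command line arguments passed by the Oozie workflow below; short names ("-mt") and long names ("--relationPath") are interchangeable. A hedged example of parsing the propagate-relations parameters outside the workflow, with hypothetical local paths:

import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class PropagateRelationsArgsSketch {

	public static void main(String[] args) throws Exception {
		// reads the same spec shown above, as SparkPropagateRelationsJob does
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					PropagateRelationsArgsSketch.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json")));
		parser
			.parseArgument(
				new String[] {
					"-mt", "local[*]",
					"--relationPath", "/tmp/graph/relation",
					"--mergeRelPath", "/tmp/dedup/publication/mergeRel",
					"--targetRelPath", "/tmp/dedup/publication/updated_relation"
				});
		System.out.println(parser.get("targetRelPath"));
	}
}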


@ -1,182 +0,0 @@
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>entity</name>
<description>the entity that should be processed</description>
</property>
<property>
<name>dedupConf</name>
<description>the dedup Configuration</description>
</property>
<property>
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
</parameters>
<start to="DeleteWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteWorkingPath">
<fs>
<delete path='${targetPath}/${entity}'/>
<mkdir path="${targetPath}"/>
<mkdir path="${targetPath}/${entity}"/>
</fs>
<ok to="CreateSimRels"/>
<error to="Kill"/>
</action>
<action name="CreateSimRels">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateConnectedComponents"/>
<error to="Kill"/>
</action>
<action name="CreateConnectedComponents">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Connected Components</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateDedupRecord"/>
<error to="Kill"/>
</action>
<action name="CreateDedupRecord">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--dedupPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="fixRelation"/>
<error to="Kill"/>
</action>
<action name="fixRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Propagate Dedup Relations</name>
<class>eu.dnetlib.dedup.sx.SparkPropagateRelationsJob</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
<arg>--targetRelPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>
</spark>
<ok to="updateDeletedByInferenceEntity"/>
<error to="Kill"/>
</action>
<action name="updateDeletedByInferenceEntity">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Update ${entity} and add DedupRecord</name>
<class>eu.dnetlib.dedup.sx.SparkUpdateEntityWithDedupInfo</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--entityPath</arg><arg>${sourcePath}/${entity}</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>
<arg>--targetPath</arg><arg>${targetPath}/${entity}/updated_record</arg>
</spark>
<ok to="replaceEntity"/>
<error to="Kill"/>
</action>
<action name="replaceEntity">
<fs>
<delete path='${sourcePath}/${entity}'/>
<delete path='${sourcePath}/relation'/>
<move source="${targetPath}/${entity}/updated_relation" target="${sourcePath}/relation" />
<move source="${targetPath}/${entity}/updated_record" target="${sourcePath}/${entity}" />
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>


@ -1,378 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "2000",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "200",
"rootBuilder": [
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "pid",
"type": "JSON",
"path": "$.pid",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[*].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"^Inside Front Cover$",
"^CORR Insights$",
"^Index des notions$",
"^Department of Error.$",
"^Untitled Item$",
"^Department of Error$",
"^Tome II : 1598 à 1605$",
"^(à lexception de roi, prince, royauté, pouvoir, image… qui sont omniprésents)$",
"^Museen und Ausstellungsinstitute in Nürnberg$",
"^Text/Conference Paper$",
"^Table des illustrations$",
"^An Intimate Insight on Psychopathy and a Novel Hermeneutic Psychological Science$",
"^Index des noms$",
"^Reply by Authors.$",
"^Titelblatt - Inhalt$",
"^Index des œuvres,$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}
}
}
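
The profile above is the publication dedup configuration consumed through DedupConfig.load: the wf block drives blocking and grouping (orderField, queueMaxSize, groupMaxSize, idPath, maxIterations) while the pace block defines the clustering keys, the decision tree and the title blacklist. A hedged loading sketch, assuming the JSON is packaged as a classpath resource under a made-up name:

import org.apache.commons.io.IOUtils;

import eu.dnetlib.pace.config.DedupConfig;

public class DedupConfigSketch {

	public static void main(String[] args) throws Exception {
		// hypothetical resource name for the profile shown above
		final String json = IOUtils
			.toString(DedupConfigSketch.class.getResourceAsStream("/pub.dedup.conf.json"));
		final DedupConfig config = DedupConfig.load(json);
		// the same accessors used by Deduper.createsortedBlocks and SparkCreateConnectedComponent
		System.out.println(config.getWf().getOrderField()); // title
		System.out.println(config.getWf().getGroupMaxSize()); // 100
		System.out.println(config.getWf().getIdPath()); // $.id
		System.out.println(config.getWf().getMaxIterations()); // 20
	}
}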


@ -6,7 +6,7 @@ import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
@ -121,11 +121,11 @@ object DoiBoostMappingUtil {
def getOpenAccessQualifier():AccessRight = {
OafUtils.createAccessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
}
def getRestrictedQualifier():AccessRight = {
OafUtils.createAccessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
}
@ -153,7 +153,7 @@ object DoiBoostMappingUtil {
if (item.openAccess)
i.setAccessright(getOpenAccessQualifier())
val ar = getOpenAccessQualifier()
publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
}
else {
hb = ModelConstants.UNKNOWN_REPOSITORY
@ -165,11 +165,11 @@ object DoiBoostMappingUtil {
if (ar.nonEmpty) {
if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){
val ar = getOpenAccessQualifier()
publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
}
else {
val ar = getRestrictedQualifier()
publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
}
}
publication
@ -257,7 +257,7 @@ object DoiBoostMappingUtil {
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(OafUtils.createQualifier(ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS))
di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS))
di
}
@ -265,7 +265,7 @@ object DoiBoostMappingUtil {
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName))
sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
sp.setValue(value)
sp
@ -275,7 +275,7 @@ object DoiBoostMappingUtil {
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName))
sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
@ -284,7 +284,7 @@ object DoiBoostMappingUtil {
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(OafUtils.createQualifier(classId, schemeId))
sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
sp.setValue(value)
sp
@ -294,7 +294,7 @@ object DoiBoostMappingUtil {
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(OafUtils.createQualifier(classId, schemeId))
sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
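
The change above replaces the scholexplorer OafUtils factory helpers with OafMapperUtils from the dhp-schemas module; the new methods always take the full four-argument form (classid, classname, schemeid, schemename), which is why the former two-argument createQualifier calls are expanded. A minimal sketch of the new calls, with values taken from the diff and assuming the usual dhp-schemas packages for Qualifier and AccessRight:

import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

public class QualifierMigrationSketch {

	public static void main(String[] args) {
		final AccessRight open = OafMapperUtils
			.accessRight("OPEN", "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES);
		final Qualifier peerReviewed = OafMapperUtils
			.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS);
		System.out.println(open.getClassid() + " / " + peerReviewed.getClassname());
	}
}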


@ -2,20 +2,19 @@ package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.apache.commons.lang.StringUtils
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST._
import org.json4s.JsonAST.{JValue, _}
import org.json4s.jackson.JsonMethods._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
import java.util
import eu.dnetlib.doiboost.DoiBoostMappingUtil
@ -183,12 +182,12 @@ case object Crossref2Oaf {
if(has_review != JNothing) {
instance.setRefereed(
OafUtils.createQualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
}
instance.setAccessright(getRestrictedQualifier())
instance.setInstancetype(OafUtils.createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
result.setResourcetype(OafUtils.createQualifier(cobjCategory.substring(0, 4),ModelConstants.DNET_DATA_CITE_RESOURCE))
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
instance.setCollectedfrom(createCrossrefCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) {


@ -21,7 +21,7 @@ object SparkMapDumpIntoOAF {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession


@ -1,52 +1,19 @@
package eu.dnetlib.doiboost.orcid
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
object SparkConvertORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def fixORCIDItem(item :ORCIDItem):ORCIDItem = {
new ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList)
}
def run(spark:SparkSession,sourcePath:String,workingPath:String, targetPath:String):Unit = {
import spark.implicits._
def run(spark:SparkSession, workingPath:String, targetPath:String) :Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val inputRDD:RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s))
spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null)
spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
val authors :Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
val works :Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
works.joinWith(authors, authors("oid").equalTo(works("oid")))
.map(i =>{
val doi = i._1.doi
var author = i._2
(doi, author)
}).groupBy(col("_1").alias("doi"))
.agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem]
.map(s => fixORCIDItem(s))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor")
import spark.implicits._
val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
logger.info("Converting ORCID to OAF")
@ -55,7 +22,7 @@ object SparkConvertORCIDToOAF {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
@ -65,10 +32,10 @@ object SparkConvertORCIDToOAF {
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")
run(spark, sourcePath, workingPath, targetPath)
run(spark,workingPath, targetPath)
}


@ -0,0 +1,70 @@
package eu.dnetlib.doiboost.orcid
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkPreprocessORCID {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def fixORCIDItem(item :ORCIDItem):ORCIDItem = {
ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList)
}
def run(spark:SparkSession,sourcePath:String,workingPath:String):Unit = {
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val inputRDD:RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s))
spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null)
spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
val authors :Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
val works :Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
works.joinWith(authors, authors("oid").equalTo(works("oid")))
.map(i =>{
val doi = i._1.doi
val author = i._2
(doi, author)
}).groupBy(col("_1").alias("doi"))
.agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem]
.map(s => fixORCIDItem(s))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor")
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")
run(spark, sourcePath, workingPath)
}
}


@ -4,6 +4,7 @@ package eu.dnetlib.doiboost.orcidnodoi;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@ -32,10 +33,7 @@ import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.dhp.schema.orcid.Work;
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.dhp.schema.orcid.*;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
@ -111,6 +109,10 @@ public class SparkGenEnrichedOrcidWorks {
Encoders.bean(WorkDetail.class));
logger.info("Works data loaded: " + workDataset.count());
final LongAccumulator warnNotFoundContributors = spark
.sparkContext()
.longAccumulator("warnNotFoundContributors");
JavaRDD<Tuple2<String, String>> enrichedWorksRDD = workDataset
.joinWith(
authorDataset,
@ -119,7 +121,21 @@ public class SparkGenEnrichedOrcidWorks {
(MapFunction<Tuple2<WorkDetail, AuthorData>, Tuple2<String, String>>) value -> {
WorkDetail w = value._1;
AuthorData a = value._2;
AuthorMatcher.match(a, w.getContributors());
if (w.getContributors() == null
|| (w.getContributors() != null && w.getContributors().size() == 0)) {
Contributor c = new Contributor();
c.setName(a.getName());
c.setSurname(a.getSurname());
c.setCreditName(a.getCreditName());
c.setOid(a.getOid());
List<Contributor> contributors = Arrays.asList(c);
w.setContributors(contributors);
if (warnNotFoundContributors != null) {
warnNotFoundContributors.add(1);
}
} else {
AuthorMatcher.match(a, w.getContributors());
}
return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
@ -172,7 +188,7 @@ public class SparkGenEnrichedOrcidWorks {
OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, p))))
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile(
workingPath.concat(outputEnrichedWorksPath),
outputEnrichedWorksPath,
Text.class,
Text.class,
SequenceFileOutputFormat.class,
@ -180,6 +196,7 @@ public class SparkGenEnrichedOrcidWorks {
logger.info("parsedPublications: " + parsedPublications.value().toString());
logger.info("enrichedPublications: " + enrichedPublications.value().toString());
logger.info("warnNotFoundContributors: " + warnNotFoundContributors.value().toString());
logger.info("errorsGeneric: " + errorsGeneric.value().toString());
logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());


@ -18,7 +18,7 @@ object SparkMapUnpayWallToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession


@ -0,0 +1,6 @@
[
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the OAF Orcid transformed", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source path ", "paramRequired": false},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]


@ -0,0 +1,6 @@
[
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the OAF Orcid transformed", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path ", "paramRequired": false},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]

View File

@ -0,0 +1,6 @@
[
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the OAF Orcid transformed", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source path ", "paramRequired": false},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]

View File

@ -1,101 +0,0 @@
<workflow-app name="import Crossref from index into HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>timestamp</name>
<description>Timestamp for incremental Harvesting</description>
</property>
</parameters>
<start to="ImportCrossRef"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportCrossRef">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
<arg>-t</arg><arg>${workingPath}/input/crossref/index_update</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-ts</arg><arg>${timestamp}</arg>
</java>
<ok to="GenerateDataset"/>
<error to="Kill"/>
</action>
<action name="GenerateDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>/data/doiboost/input/crossref</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="RenameDataset"/>
<error to="Kill"/>
</action>
<action name="RenameDataset">
<fs>
<delete path='${workingPath}/input/crossref/crossref_ds'/>
<move source="${workingPath}/input/crossref/crossref_ds_updated"
target="${workingPath}/input/crossref/crossref_ds"/>
</fs>
<ok to="ConvertCrossrefToOAF"/>
<error to="Kill"/>
</action>
<action name="ConvertCrossrefToOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ConvertCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingPath}/input/crossref/crossref_ds</arg>
<arg>--targetPath</arg><arg>${workingPath}/process/</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,38 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

View File

@ -1,96 +0,0 @@
<workflow-app name="Create DOIBoostActionSet" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>hostedByMapPath</name>
<description>the Hosted By Map Path</description>
</property>
<property>
<name>affiliationPath</name>
<description>the Affliation Path</description>
</property>
<property>
<name>paperAffiliationPath</name>
<description>the paperAffiliation Path</description>
</property>
<property>
<name>workingDirPath</name>
<description>the Working Path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="CreateDOIBoost"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="CreateDOIBoost">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create DOIBoost Infospace</name>
<class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
<arg>--affiliationPath</arg><arg>${affiliationPath}</arg>
<arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="GenerateActionSet"/>
<error to="Kill"/>
</action>
<action name="GenerateActionSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Generate DOIBoost ActionSet</name>
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--dbPublicationPath</arg><arg>${workingDirPath}/doiBoostPublicationFiltered</arg>
<arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg>
<arg>--crossRefRelation</arg><arg>${workingDirPath}/crossrefRelation</arg>
<arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg>
<arg>-do</arg><arg>${workingDirPath}/doiBoostOrganization</arg>
<arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,42 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.wf.rerun.failnodes</name>
<value>false</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

View File

@ -1,92 +0,0 @@
<workflow-app name="import MAG into HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the working dir base path</description>
</property>
<property>
<name>targetPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}'/>
<mkdir path='${workingPath}'/>
</fs>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
<action name="ConvertMagToDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="PreprocessMag"/>
<error to="Kill"/>
</action>
<action name="PreprocessMag">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to OAF Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}/process</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -34,7 +34,7 @@
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert ORCID to Dataset</name>
<class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
<class>eu.dnetlib.doiboost.orcid.SparkPreprocessORCID</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}

View File

@ -7,9 +7,14 @@
</property>
<property>
<name>outputPath</name>
<value>/data/orcid_activities_2020/no_doi_dataset_prod/</value>
<description>path where to store the action set</description>
</property>
<property>
<name>processOutputPath</name>
<value>/data/orcid_activities_2020/process_no_doi_dataset_prod</value>
<description>temporary path where to store the action set</description>
</property>
<property>
<name>spark2GenNoDoiDatasetMaxExecutors</name>
<value>40</value>
@ -66,7 +71,7 @@
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/no_doi_dataset'/>
<delete path='${processOutputPath}'/>
</fs>
<ok to="GenOrcidNoDoiDataset"/>
<error to="Kill"/>
@ -92,7 +97,7 @@
<arg>--workingPath</arg><arg>${workingPath}/</arg>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--orcidDataFolder</arg><arg>last_orcid_dataset</arg>
<arg>--outputEnrichedWorksPath</arg><arg>no_doi_dataset</arg>
<arg>--outputEnrichedWorksPath</arg><arg>${processOutputPath}</arg>
</spark>
<ok to="importOrcidNoDoi"/>
<error to="Kill"/>
@ -100,7 +105,7 @@
<action name="importOrcidNoDoi">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${workingPath}/no_doi_dataset/*</arg>
<arg>${processOutputPath}/*</arg>
<arg>${outputPath}</arg>
</distcp>
<ok to="End"/>

View File

@ -0,0 +1,216 @@
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<!-- Crossref Parameters -->
<property>
<name>inputPathCrossref</name>
<description>the Crossref input path</description>
</property>
<property>
<name>crossrefDumpPath</name>
<description>the Crossref dump path</description>
</property>
<!-- MAG Parameters -->
<property>
<name>MAGDumpPath</name>
<description>the MAG dump working path</description>
</property>
<property>
<name>inputPathMAG</name>
<description>the MAG working path</description>
</property>
<!-- ORCID Parameters -->
<property>
<name>inputPathOrcid</name>
<description>the ORCID input path</description>
</property>
<property>
<name>workingPathOrcid</name>
<description>the ORCID working path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="UnpackCrossrefEntries">${wf:conf('resumeFrom') eq 'UnpackCrossrefEntries'}</case>
<case to="GenerateCrossrefDataset">${wf:conf('resumeFrom') eq 'GenerateCrossrefDataset'}</case>
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
<case to="ConvertMagToDataset">${wf:conf('resumeFrom') eq 'ConvertMagToDataset'}</case>
<case to="PreProcessORCID">${wf:conf('resumeFrom') eq 'PreProcessORCID'}</case>
<default to="ImportCrossRef"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportCrossRef">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</main-class>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--crossrefFileNameTarGz</arg><arg>${crossrefDumpPath}/crossref.tar.gz</arg>
<arg>--workingPath</arg><arg>${crossrefDumpPath}</arg>
<arg>--outputPath</arg><arg>${crossrefDumpPath}/files/</arg>
</java>
<ok to="UnpackCrossrefEntries"/>
<error to="Kill"/>
</action>
<action name="UnpackCrossrefEntries">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>SparkUnpackCrossrefEntries</name>
<class>eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${crossrefDumpPath}/files</arg>
<arg>--targetPath</arg><arg>${crossrefDumpPath}/crossref_unpack/</arg>
</spark>
<ok to="GenerateCrossrefDataset"/>
<error to="Kill"/>
</action>
<action name="GenerateCrossrefDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>SparkGenerateCrossrefDataset</name>
<class>eu.dnetlib.doiboost.crossref.GenerateCrossrefDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=7G
--executor-cores=2
--driver-memory=7G
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${crossrefDumpPath}/crossref_unpack/</arg>
<arg>--targetPath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
</spark>
<ok to="removeFiles"/>
<error to="Kill"/>
</action>
<action name="removeFiles">
<fs>
<!-- <delete path="${crossrefDumpPath}/files"/>-->
<delete path="${crossrefDumpPath}/crossref_unpack/"/>
</fs>
<ok to="ResetMagWorkingPath"/>
<error to="Kill"/>
</action>
<!-- MAG SECTION -->
<action name="ResetMagWorkingPath">
<fs>
<delete path="${inputPathMAG}/dataset"/>
<delete path="${inputPathMAG}/process"/>
</fs>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
<action name="ConvertMagToDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${MAGDumpPath}</arg>
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="PreProcessORCID"/>
<error to="Kill"/>
</action>
<!-- ORCID SECTION -->
<action name="PreProcessORCID">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert ORCID to Dataset</name>
<class>eu.dnetlib.doiboost.orcid.SparkPreprocessORCID</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
<arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,7 +1,6 @@
[
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the Orcid Input file", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path ", "paramRequired": false},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]

View File

@ -1,4 +1,4 @@
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="Generate DOIBoost ActionSet for PROD" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
@ -17,8 +17,6 @@
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<!-- Itersection Parameters -->
<property>
<name>workingPath</name>
@ -40,29 +38,8 @@
<name>inputPathCrossref</name>
<description>the Crossref input path</description>
</property>
<property>
<name>crossrefDumpPath</name>
<description>the Crossref dump path</description>
</property>
<!-- <property>-->
<!-- <name>crossrefTimestamp</name>-->
<!-- <description>Timestamp for the Crossref incremental Harvesting</description>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>esServer</name>-->
<!-- <description>elasticsearch server url for the Crossref Harvesting</description>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>esIndex</name>-->
<!-- <description>elasticsearch index name for the Crossref Harvesting</description>-->
<!-- </property>-->
<!-- MAG Parameters -->
<property>
<name>MAGDumpPath</name>
<description>the MAG dump working path</description>
</property>
<property>
<name>inputPathMAG</name>
<description>the MAG working path</description>
@ -76,11 +53,6 @@
</property>
<!-- ORCID Parameters -->
<property>
<name>inputPathOrcid</name>
<description>the ORCID input path</description>
</property>
<property>
<name>workingPathOrcid</name>
<description>the ORCID working path</description>
@ -103,15 +75,12 @@
<decision name="resume_from">
<switch>
<case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
<case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'ProcessORCID'}</case>
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
<case to="GenerateCrossrefDataset">${wf:conf('resumeFrom') eq 'GenerateCrossrefDataset'}</case>
<default to="ImportCrossRef"/>
<default to="ConvertCrossrefToOAF"/>
</switch>
</decision>
@ -119,170 +88,6 @@
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- <action name="ImportCrossRef">-->
<!-- <java>-->
<!-- <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>-->
<!-- <arg>&#45;&#45;targetPath</arg><arg>${inputPathCrossref}/index_update</arg>-->
<!-- <arg>&#45;&#45;namenode</arg><arg>${nameNode}</arg>-->
<!-- <arg>&#45;&#45;esServer</arg><arg>${esServer}</arg>-->
<!-- <arg>&#45;&#45;esIndex</arg><arg>${esIndex}</arg>-->
<!-- <arg>&#45;&#45;timestamp</arg><arg>${crossrefTimestamp}</arg>-->
<!-- </java>-->
<!-- <ok to="GenerateCrossrefDataset"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="ImportCrossRef">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</main-class>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--crossrefFileNameTarGz</arg><arg>${crossrefDumpPath}/crossref.tar.gz</arg>
<arg>--workingPath</arg><arg>${crossrefDumpPath}</arg>
<arg>--outputPath</arg><arg>${crossrefDumpPath}/files/</arg>
</java>
<ok to="UnpackCrossrefEntries"/>
<error to="Kill"/>
</action>
<action name="UnpackCrossrefEntries">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>SparkUnpackCrossrefEntries</name>
<class>eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${crossrefDumpPath}/files</arg>
<arg>--targetPath</arg><arg>${crossrefDumpPath}/crossref_unpack/</arg>
</spark>
<ok to="GenerateCrossrefDataset"/>
<error to="Kill"/>
</action>
<action name="GenerateCrossrefDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>SparkGenerateCrossrefDataset</name>
<class>eu.dnetlib.doiboost.crossref.GenerateCrossrefDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=7G
--executor-cores=2
--driver-memory=7G
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${crossrefDumpPath}/crossref_unpack/</arg>
<arg>--targetPath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
</spark>
<ok to="removeFiles"/>
<error to="Kill"/>
</action>
<action name="removeFiles">
<fs>
<!-- <delete path="${crossrefDumpPath}/files"/>-->
<delete path="${crossrefDumpPath}/crossref_unpack/"/>
</fs>
<ok to="ResetMagWorkingPath"/>
<error to="Kill"/>
</action>
<!-- CROSSREF SECTION -->
<!-- <action name="GenerateCrossrefDataset">-->
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
<!-- <master>yarn-cluster</master>-->
<!-- <mode>cluster</mode>-->
<!-- <name>GenerateCrossrefDataset</name>-->
<!-- <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>-->
<!-- <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
<!-- <spark-opts>-->
<!-- &#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!-- &#45;&#45;executor-cores=${sparkExecutorCores}-->
<!-- &#45;&#45;driver-memory=${sparkDriverMemory}-->
<!-- &#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!-- &#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!-- &#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!-- &#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!-- &#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!-- </spark-opts>-->
<!-- <arg>&#45;&#45;workingPath</arg><arg>${inputPathCrossref}</arg>-->
<!-- <arg>&#45;&#45;master</arg><arg>yarn-cluster</arg>-->
<!-- </spark>-->
<!-- <ok to="RenameDataset"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="RenameDataset">-->
<!-- <fs>-->
<!-- <delete path="${inputPathCrossref}/crossref_ds"/>-->
<!-- <move source="${inputPathCrossref}/crossref_ds_updated"-->
<!-- target="${inputPathCrossref}/crossref_ds"/>-->
<!-- </fs>-->
<!-- <ok to="ResetMagWorkingPath"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- MAG SECTION -->
<action name="ResetMagWorkingPath">
<fs>
<delete path="${inputPathMAG}/dataset"/>
<delete path="${inputPathMAG}/process"/>
</fs>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
<action name="ConvertMagToDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${MAGDumpPath}</arg>
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ConvertCrossrefToOAF"/>
<error to="Kill"/>
</action>
<action name="ConvertCrossrefToOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
@ -326,7 +131,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
<arg>--workingPath</arg><arg>${inputPathMAG}/process_p</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
@ -380,7 +185,6 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
<arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
<arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
<arg>--master</arg><arg>yarn-cluster</arg>

View File

@ -1,38 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

View File

@ -1,55 +0,0 @@
<workflow-app name="import UnpayWall into HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the working dir base path</description>
</property>
<property>
<name>targetPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="PreprocessUW"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="PreprocessUW">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert UnpayWall to Dataset</name>
<class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/uw_extracted</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,22 +1,15 @@
package eu.dnetlib.doiboost.mag
import java.sql.Timestamp
import eu.dnetlib.dhp.schema.oaf.Publication
import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, SparkSession}
import org.codehaus.jackson.map.ObjectMapper
import org.junit.jupiter.api.Assertions._
import org.apache.spark.sql.functions._
import org.junit.jupiter.api.Test
import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import java.sql.Timestamp
import scala.io.Source
import scala.reflect.ClassTag
import scala.util.matching.Regex
@ -65,14 +58,19 @@ class MAGMappingTest {
@Test
def normalizeDoiTest():Unit = {
import org.json4s.jackson.Serialization.write
import org.json4s.DefaultFormats
implicit val formats = DefaultFormats
val conf = new SparkConf().setAppName("test").setMaster("local[2]")
val sc = new SparkContext(conf)
val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
val spark: SparkSession =
SparkSession
.builder()
.appName(getClass.getSimpleName)
.config(conf)
.getOrCreate()
val path = getClass.getResource("magPapers.json").getPath
import org.apache.spark.sql.Encoders
@ -90,14 +88,19 @@ class MAGMappingTest {
@Test
def normalizeDoiTest2():Unit = {
import org.json4s.jackson.Serialization.write
import org.json4s.DefaultFormats
implicit val formats = DefaultFormats
val conf = new SparkConf().setAppName("test").setMaster("local[2]")
val sc = new SparkContext(conf)
val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
val spark: SparkSession =
SparkSession
.builder()
.appName(getClass.getSimpleName)
.config(conf)
.getOrCreate()
val path = getClass.getResource("duplicatedMagPapers.json").getPath
import org.apache.spark.sql.Encoders
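The tests now build a plain local SparkSession instead of wiring a SparkContext first, and pin spark.driver.host to localhost so the driver does not try to bind to a hostname that may not resolve on a CI worker. A minimal sketch of the same setup as a self-contained JUnit 5 test in Java (class and test names are illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

public class LocalSparkSessionTest {

    private static SparkSession spark;

    @BeforeAll
    public static void setUp() {
        final SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");                 // run everything in-process
        conf.set("spark.driver.host", "localhost"); // avoid hostname resolution issues on CI workers
        spark = SparkSession
            .builder()
            .appName(LocalSparkSessionTest.class.getSimpleName())
            .config(conf)
            .getOrCreate();
    }

    @Test
    public void countsARange() {
        assertEquals(10L, spark.range(10).count());
    }

    @AfterAll
    public static void tearDown() {
        spark.stop();
    }
}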

View File

@ -46,7 +46,9 @@ class MappingORCIDToOAFTest {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
import spark.implicits._
SparkConvertORCIDToOAF.run( spark,sourcePath, workingPath, targetPath)
SparkPreprocessORCID.run( spark,sourcePath, workingPath)
SparkConvertORCIDToOAF.run(spark, workingPath,targetPath)
val mapper = new ObjectMapper()
@ -61,6 +63,8 @@ class MappingORCIDToOAFTest {
assertTrue(oA == p.count())
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(p.first()))
spark.close()
}
@ -78,7 +82,7 @@ class MappingORCIDToOAFTest {
val oaf = ORCIDToOAF.convertTOOAF(orcid)
assert(oaf.getPid.size() == 1)
oaf.getPid.toList.foreach(pid => assert(pid.getQualifier.getClassid.equals("doi")))
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876".toLowerCase())))
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876")))
//println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))

View File

@ -71,6 +71,8 @@ public abstract class AbstractMdRecordToOafMapper {
private final boolean shouldHashId;
private final boolean forceOriginalId;
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
@ -98,11 +100,20 @@ public abstract class AbstractMdRecordToOafMapper {
nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
}
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
final boolean shouldHashId, final boolean forceOriginalId) {
this.vocs = vocs;
this.invisible = invisible;
this.shouldHashId = shouldHashId;
this.forceOriginalId = forceOriginalId;
}
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
final boolean shouldHashId) {
this.vocs = vocs;
this.invisible = invisible;
this.shouldHashId = shouldHashId;
this.forceOriginalId = false;
}
public List<Oaf> processMdRecord(final String xml) {
@ -190,10 +201,16 @@ public abstract class AbstractMdRecordToOafMapper {
final long lastUpdateTimestamp) {
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
if (!id.equals(entity.getId())) {
entity.getOriginalId().add(entity.getId());
entity.setId(id);
final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
originalId.add(entity.getId());
entity.setOriginalId(Lists.newArrayList(originalId));
if (!forceOriginalId) {
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
if (!id.equals(entity.getId())) {
entity.setId(id);
}
}
final List<Oaf> oafs = Lists.newArrayList(entity);

View File

@ -163,11 +163,13 @@ public class GenerateEntitiesApplication {
switch (type.toLowerCase()) {
case "oaf-store-cleaned":
case "oaf-store-claim":
return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "oaf-store-claim":
return new OafToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
case "odf-store-cleaned":
case "odf-store-claim":
return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "odf-store-claim":
return new OdfToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
case "oaf-store-intersection":
return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "odf-store-intersection":

View File

@ -27,6 +27,11 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
final boolean forceOrginalId) {
super(vocs, invisible, shouldHashId, forceOrginalId);
}
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible, shouldHashId);
}

View File

@ -22,6 +22,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
final boolean forceOrginalId) {
super(vocs, invisible, shouldHashId, forceOrginalId);
}
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible, shouldHashId);
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dhp.oa.sx.graphimport
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object SparkDataciteToOAF {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
import spark.implicits._
val sc = spark.sparkContext
val inputPath = parser.get("inputPath")
}
}

View File

@ -1,176 +0,0 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
object EBIAggregator {
def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{
override def zero: OafDataset = new OafDataset()
override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: OafDataset): OafDataset = reduction
override def bufferEncoder: Encoder[OafDataset] =
Encoders.kryo(classOf[OafDataset])
override def outputEncoder: Encoder[OafDataset] =
Encoders.kryo(classOf[OafDataset])
}
def getDLIUnknownAggregator(): Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown] = new Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown]{
override def zero: DLIUnknown = new DLIUnknown()
override def reduce(b: DLIUnknown, a: (String, DLIUnknown)): DLIUnknown = {
b.mergeFrom(a._2)
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: DLIUnknown, wy: DLIUnknown): DLIUnknown = {
wx.mergeFrom(wy)
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: DLIUnknown): DLIUnknown = reduction
override def bufferEncoder: Encoder[DLIUnknown] =
Encoders.kryo(classOf[DLIUnknown])
override def outputEncoder: Encoder[DLIUnknown] =
Encoders.kryo(classOf[DLIUnknown])
}
def getDLIDatasetAggregator(): Aggregator[(String, DLIDataset), DLIDataset, DLIDataset] = new Aggregator[(String, DLIDataset), DLIDataset, DLIDataset]{
override def zero: DLIDataset = new DLIDataset()
override def reduce(b: DLIDataset, a: (String, DLIDataset)): DLIDataset = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: DLIDataset, wy: DLIDataset): DLIDataset = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: DLIDataset): DLIDataset = reduction
override def bufferEncoder: Encoder[DLIDataset] =
Encoders.kryo(classOf[DLIDataset])
override def outputEncoder: Encoder[DLIDataset] =
Encoders.kryo(classOf[DLIDataset])
}
def getDLIPublicationAggregator(): Aggregator[(String, DLIPublication), DLIPublication, DLIPublication] = new Aggregator[(String, DLIPublication), DLIPublication, DLIPublication]{
override def zero: DLIPublication = new DLIPublication()
override def reduce(b: DLIPublication, a: (String, DLIPublication)): DLIPublication = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: DLIPublication, wy: DLIPublication): DLIPublication = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: DLIPublication): DLIPublication = reduction
override def bufferEncoder: Encoder[DLIPublication] =
Encoders.kryo(classOf[DLIPublication])
override def outputEncoder: Encoder[DLIPublication] =
Encoders.kryo(classOf[DLIPublication])
}
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
override def zero: Publication = new Publication()
override def reduce(b: Publication, a: (String, Publication)): Publication = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: Publication, wy: Publication): Publication = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: Publication): Publication = reduction
override def bufferEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
override def outputEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
}
def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{
override def zero: Relation = new Relation()
override def reduce(b: Relation, a: (String, Relation)): Relation = {
a._2
}
override def merge(a: Relation, b: Relation): Relation = {
if(b!= null) b else a
}
override def finish(reduction: Relation): Relation = reduction
override def bufferEncoder: Encoder[Relation] =
Encoders.kryo(classOf[Relation])
override def outputEncoder: Encoder[Relation] =
Encoders.kryo(classOf[Relation])
}
}

View File

@ -1,247 +0,0 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, Instance, Journal, KeyValue, Oaf, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, OafUtils, ProvenaceInfo}
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal}
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.apache.spark.sql.functions._
import scala.collection.JavaConverters._
object SparkAddLinkUpdates {
val relationMapper = RelationMapper.load
case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {}
def generatePubmedDLICollectedFrom(): KeyValue = {
OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
}
def journalToOAF(pj:PMJournal): Journal = {
val j = new Journal
j.setIssnPrinted(pj.getIssn)
j.setVol(pj.getVolume)
j.setName(pj.getTitle)
j.setIss(pj.getIssue)
j.setDataInfo(OafUtils.generateDataInfo())
j
}
def pubmedTOPublication(input:PMArticle):DLIPublication = {
val dnetPublicationId = s"50|${DHPUtils.md5(s"${input.getPmid}::pmid")}"
val p = new DLIPublication
p.setId(dnetPublicationId)
p.setDataInfo(OafUtils.generateDataInfo())
p.setPid(List(OafUtils.createSP(input.getPmid.toLowerCase.trim, "pmid", ModelConstants.DNET_PID_TYPES)).asJava)
p.setCompletionStatus("complete")
val pi = new ProvenaceInfo
pi.setId("dli_________::europe_pmc__")
pi.setName( "Europe PMC")
pi.setCompletionStatus("complete")
pi.setCollectionMode("collected")
p.setDlicollectedfrom(List(pi).asJava)
p.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
if (input.getAuthors != null && input.getAuthors.size() >0) {
var aths: List[Author] = List()
input.getAuthors.asScala.filter(a=> a!= null).foreach(a => {
val c = new Author
c.setFullname(a.getFullName)
c.setName(a.getForeName)
c.setSurname(a.getLastName)
aths = aths ::: List(c)
})
if (aths.nonEmpty)
p.setAuthor(aths.asJava)
}
if (input.getJournal != null)
p.setJournal(journalToOAF(input.getJournal))
p.setTitle(List(OafUtils.createSP(input.getTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava)
p.setDateofacceptance(OafUtils.asField(input.getDate))
val i = new Instance
i.setCollectedfrom(generatePubmedDLICollectedFrom())
i.setDateofacceptance(p.getDateofacceptance)
i.setUrl(List(s"https://pubmed.ncbi.nlm.nih.gov/${input.getPmid}").asJava)
i.setInstancetype(createQualifier("0001", "Article", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
p.setInstance(List(i).asJava)
p
}
def ebiLinksToOaf(input:(String, String)):List[Oaf] = {
val pmid :String = input._1
val input_json :String = input._2
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input_json)
val targets:List[EBILinks] = for {
JObject(link) <- json \\ "Category" \\ "Link"
JField("PublicationDate", JString(pubdate)) <- link
JField("RelationshipType", JObject(relationshipType)) <- link
JField("Name", JString(relname)) <- relationshipType
JField("Target", JObject(target)) <- link
JField("Identifier", JObject(identifier)) <- target
JField("ID", JString(tpid)) <- identifier
JField("IDScheme", JString(tpidtype)) <- identifier
JField("IDURL", JString(turl)) <- identifier
JField("Title", JString(title)) <- target
JField("Publisher", JObject(pub)) <- target
JField("Name", JString(publisher)) <- pub
} yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher)
val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"
targets.flatMap(l => {
val relation = new Relation
val inverseRelation = new Relation
val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
val relInfo = relationMapper.get(l.relation.toLowerCase)
val relationSemantic = relInfo.getOriginal
val inverseRelationSemantic = relInfo.getInverse
relation.setSource(dnetPublicationId)
relation.setTarget(targetDnetId)
relation.setRelClass("datacite")
relation.setRelType(relationSemantic)
relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
inverseRelation.setSource(targetDnetId)
inverseRelation.setTarget(dnetPublicationId)
inverseRelation.setRelClass("datacite")
inverseRelation.setRelType(inverseRelationSemantic)
inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
val d = new DLIDataset
d.setId(targetDnetId)
d.setDataInfo(OafUtils.generateDataInfo())
d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, ModelConstants.DNET_PID_TYPES)).asJava)
d.setCompletionStatus("complete")
val pi = new ProvenaceInfo
pi.setId("dli_________::europe_pmc__")
pi.setName( "Europe PMC")
pi.setCompletionStatus("complete")
pi.setCollectionMode("collected")
d.setDlicollectedfrom(List(pi).asJava)
d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
d.setPublisher(OafUtils.asField(l.publisher))
d.setTitle(List(OafUtils.createSP(l.title, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava)
d.setDateofacceptance(OafUtils.asField(l.pubdate))
val i = new Instance
i.setCollectedfrom(generatePubmedDLICollectedFrom())
i.setDateofacceptance(d.getDateofacceptance)
i.setUrl(List(l.turl).asJava)
i.setInstancetype(createQualifier("0021", "Dataset", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
d.setInstance(List(i).asJava)
List(relation, inverseRelation, d)
})
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val workingPath = parser.get("workingPath")
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val oafpubEncoder: Encoder[Publication] = Encoders.kryo[Publication]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication])
implicit val atEncoder: Encoder[Author] = Encoders.kryo(classOf[Author])
implicit val strEncoder:Encoder[String] = Encoders.STRING
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))
ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")
ds.filter(s => s.isInstanceOf)
val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]
oDataset.filter(p =>p.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
val idPublicationSolved:Dataset[String] = spark.read.load(s"$workingPath/baseline_links_updates").where(col("links").isNotNull).select("pmid").as[String]
val baseline:Dataset[(String, PMArticle)]= spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle].map(p=> (p.getPmid, p))(Encoders.tuple(strEncoder,PMEncoder))
idPublicationSolved.joinWith(baseline, idPublicationSolved("pmid").equalTo(baseline("_1"))).map(k => pubmedTOPublication(k._2._2)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_publication")
val pmaDatasets = spark.read.load("/user/sandro.labruzzo/scholix/EBI/ebi_garr/baseline_dataset").as[PMArticle]
pmaDatasets.map(p => pubmedTOPublication(p)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_all")
val pubs: Dataset[(String,Publication)] = spark.read.load("/user/sandro.labruzzo/scholix/EBI/publication").as[Publication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,oafpubEncoder))
val pubdate:Dataset[(String,DLIPublication)] = spark.read.load(s"$workingPath/baseline_publication_all").as[DLIPublication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,pubEncoder))
pubs.joinWith(pubdate, pubs("_1").equalTo(pubdate("_1"))).map(k => k._2._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_ebi")
val dt : Dataset[DLIDataset] = spark.read.load(s"$workingPath/dataset").as[DLIDataset]
val update : Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_dataset").as[DLIDataset]
dt.union(update).map(d => (d.getId,d))(Encoders.tuple(Encoders.STRING, datEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset_ebi")
val rel: Dataset[Relation] = spark.read.load(s"$workingPath/relation").as[Relation]
val relupdate : Dataset[Relation] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_relation").as[Relation]
rel.union(relupdate)
.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite)
.save(s"$workingPath/baseline_relation_ebi")
}
}

View File

@ -1,86 +0,0 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkCreateEBIDataFrame {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sc = spark.sparkContext
val workingPath = parser.get("workingPath")
val relationMapper = RelationMapper.load
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val datasetEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication])
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
// logger.info("Extract Publication and relation from publication_xml")
// val oafPubsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/publication_xml").map(s =>
// {
// new ObjectMapper().readValue(s, classOf[String])
// }).flatMap(s => {
// val d = new PublicationScholexplorerParser
// d.parseObject(s, relationMapper).asScala.iterator})
//
// val mapper = new ObjectMapper()
// mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
// spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")
//
// logger.info("Extract Publication and relation from dataset_xml")
// val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/dataset_xml").map(s =>
// {
// new ObjectMapper().readValue(s, classOf[String])
// }).flatMap(s => {
// val d = new DatasetScholexplorerParser
// d.parseObject(s, relationMapper).asScala.iterator})
// spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf")
val dataset: Dataset[DLIDataset] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[DLIDataset]).map(d => d.asInstanceOf[DLIDataset])
val publication: Dataset[DLIPublication] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[DLIPublication]).map(d => d.asInstanceOf[DLIPublication])
val relations: Dataset[Relation] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation])
publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/publication")
dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")
}
}

View File

@ -1,3 +0,0 @@
package eu.dnetlib.dhp.sx.graph
case class IdReplace(newId:String, oldId:String) {}

View File

@ -1,153 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.bson.Document;
import org.bson.conversions.Bson;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.QueryBuilder;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* This job collects data from a Mongo database and stores it in a sequence file on HDFS. The Mongo database
* describes each MDStore in two collections: "metadata", which contains info such as ID, format, layout and
* interpretation, and "metadataManager", which contains the ID and the mongoCollectionName. From the "metadata"
* collection we filter the ids by format, layout and interpretation; from "metadataManager" we obtain the name of
* the current Mongo collection holding the metadata XML (see the getCurrentId function).
* <p>
* This job is invoked once for each (format, layout, interpretation) triple we want to import, and generates a
* sequence file of XML records per triple. (A condensed sketch of this lookup follows the class.)
*/
public class ImportDataFromMongo {
/**
* It requires in input the parameters described in the file
* eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json:
* <p>
* - the name node - the HDFS path where the file is stored - the mongo host - the mongo port - the metadata
* format to import - the metadata layout to import - the metadata interpretation to import - the mongo database name
* <p>
* These parameters are encoded into args
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
ImportDataFromMongo.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json")));
parser.parseArgument(args);
final int port = Integer.parseInt(parser.get("dbport"));
final String host = parser.get("dbhost");
final String format = parser.get("format");
final String layout = parser.get("layout");
final String interpretation = parser.get("interpretation");
final String dbName = parser.get("dbName");
final MongoClient client = new MongoClient(host, port);
MongoDatabase database = client.getDatabase(dbName);
MongoCollection<Document> metadata = database.getCollection("metadata");
MongoCollection<Document> metadataManager = database.getCollection("metadataManager");
final DBObject query = QueryBuilder
.start("format")
.is(format)
.and("layout")
.is(layout)
.and("interpretation")
.is(interpretation)
.get();
final List<String> ids = new ArrayList<>();
metadata
.find((Bson) query)
.forEach((Consumer<Document>) document -> ids.add(document.getString("mdId")));
List<String> databaseId = ids
.stream()
.map(it -> getCurrentId(it, metadataManager))
.filter(Objects::nonNull)
.collect(Collectors.toList());
final String hdfsuri = parser.get("namenode");
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem.get(URI.create(hdfsuri), conf);
Path hdfswritepath = new Path(parser.get("targetPath"));
final AtomicInteger counter = new AtomicInteger(0);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(hdfswritepath),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class))) {
final IntWritable key = new IntWritable(counter.get());
final Text value = new Text();
databaseId
.forEach(
id -> {
System.out.println("Reading :" + id);
MongoCollection<Document> collection = database.getCollection(id);
collection
.find()
.forEach(
(Consumer<Document>) document -> {
key.set(counter.getAndIncrement());
value.set(document.getString("body"));
if (counter.get() % 10000 == 0) {
System.out.println("Added " + counter.get());
}
try {
writer.append(key, value);
} catch (IOException e) {
throw new RuntimeException(e);
}
});
});
}
}
/**
* Return the name of the mongo collection given an MDStore ID
*
* @param mdId The id of the MDStore
* @param metadataManager The metadataManager collection on mongo which contains this information
* @return the name of the mongo collection currently holding the MDStore records, or null if none is found
*/
private static String getCurrentId(
final String mdId, final MongoCollection<Document> metadataManager) {
FindIterable<Document> result = metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get());
final Document item = result.first();
return item == null ? null : item.getString("currentId");
}
}
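For readers skimming this diff, a condensed Scala sketch of the lookup flow described in the class javadoc above, reusing only the driver calls that already appear in this class; the helper name readBodies and its signature are assumptions made for illustration:

import com.mongodb.QueryBuilder
import com.mongodb.client.MongoDatabase
import org.bson.conversions.Bson
import scala.collection.JavaConverters._

// Sketch only: filter the "metadata" collection by the (format, layout, interpretation) triple,
// resolve each mdId to its current collection via "metadataManager", then read the XML bodies.
def readBodies(db: MongoDatabase, format: String, layout: String, interpretation: String): Iterator[String] = {
  val triple = QueryBuilder.start("format").is(format).and("layout").is(layout).and("interpretation").is(interpretation).get()
  val mdIds = db.getCollection("metadata").find(triple.asInstanceOf[Bson]).asScala.map(_.getString("mdId"))
  val manager = db.getCollection("metadataManager")
  mdIds.iterator
    .flatMap(mdId => Option(manager.find(QueryBuilder.start("mdId").is(mdId).get().asInstanceOf[Bson]).first()))
    .map(_.getString("currentId"))
    .filter(_ != null)
    .flatMap(coll => db.getCollection(coll).find().asScala.map(_.getString("body")))
}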

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.hadoop.io.compress._
object SparkConvertObjectToJson {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val objectType = parser.get("objectType")
log.info(s"objectType -> $objectType")
implicit val scholixEncoder :Encoder[Scholix]= Encoders.kryo[Scholix]
implicit val summaryEncoder :Encoder[ScholixSummary]= Encoders.kryo[ScholixSummary]
val mapper = new ObjectMapper
objectType.toLowerCase match {
case "scholix" =>
log.info("Serialize Scholix")
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(6000).saveAsTextFile(targetPath, classOf[GzipCodec])
case "summary" =>
log.info("Serialize Summary")
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec])
}
}
}

View File

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkCreateInputGraph {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val resultObject = List(
("publication", classOf[Publication]),
("dataset", classOf[OafDataset]),
("software", classOf[Software]),
("otherResearchProduct", classOf[OtherResearchProduct])
)
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val oafDs:Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
log.info("Extract Publication")
oafDs.filter(o => o.isInstanceOf[Publication]).map(p => p.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/publication")
log.info("Extract dataset")
oafDs.filter(o => o.isInstanceOf[OafDataset]).map(p => p.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/dataset")
log.info("Extract software")
oafDs.filter(o => o.isInstanceOf[Software]).map(p => p.asInstanceOf[Software]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/software")
log.info("Extract otherResearchProduct")
oafDs.filter(o => o.isInstanceOf[OtherResearchProduct]).map(p => p.asInstanceOf[OtherResearchProduct]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/otherResearchProduct")
log.info("Extract Relation")
oafDs.filter(o => o.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/relation")
resultObject.foreach { r =>
log.info(s"Make ${r._1} unique")
makeDatasetUnique(s"$targetPath/extracted/${r._1}",s"$targetPath/dedup/${r._1}",spark, r._2)
}
}
def extractEntities[T <: Oaf ](oafDs:Dataset[Oaf], targetPath:String, clazz:Class[T], log:Logger) :Unit = {
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
log.info(s"Extract ${clazz.getSimpleName}")
oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath)
}
def makeDatasetUnique[T <: Result ](sourcePath:String, targetPath:String, spark:SparkSession, clazz:Class[T]) :Unit = {
import spark.implicits._
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
val ds:Dataset[T] = spark.read.load(sourcePath).as[T]
ds.groupByKey(_.getId).reduceGroups{(x,y) =>
x.mergeFrom(y)
x
}.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -0,0 +1,106 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkCreateScholix {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val relationPath = parser.get("relationPath")
log.info(s"relationPath -> $relationPath")
val summaryPath = parser.get("summaryPath")
log.info(s"summaryPath -> $summaryPath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
import spark.implicits._
val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Relation), (String, ScholixSummary)) =>
val rel: Relation = input._1._2
val source: ScholixSummary = input._2._2
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
if (input._2== null) {
null
} else {
val s: Scholix = input._1._2
val target: ScholixSummary = input._2._2
ScholixUtils.generateCompleteScholix(s, target)
}
}.filter(s => s!= null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix]
.map(s=> (s.getIdentifier,s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
.groupByKey(_._1)
.agg(ScholixUtils.scholixAggregator.toColumn)
.map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
val scholix_final:Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
val stats:Dataset[(String,String,Long)]= scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String,String,Long)]
stats
.map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0 ))
.groupByKey(_.id)
.reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset+b.relatedDataset, a.relatedPublication+b.relatedPublication))
.map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities")
val relatedEntitiesDS:Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication>0 || r.relatedDataset > 0)
relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map{i =>
val re = i._1
val sum = i._2._2
sum.setRelatedDatasets(re.relatedDataset)
sum.setRelatedPublications(re.relatedPublication)
sum
}.write.mode(SaveMode.Overwrite).save(s"${summaryPath}_filtered")
}
}

View File

@ -0,0 +1,42 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Result
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkCreateSummaryObject {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result]
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -1,126 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import net.minidev.json.JSONArray;
/**
* This job extracts one typology of entity and stores it in a new RDD. It is invoked several times, once for each
* file generated by {@link ImportDataFromMongo}, and stores the new RDD in a path that should be under a folder:
* extractedEntities/entity/version1. (A small routing example follows the class.)
* <p>
* At the end of this process we will have: extractedEntities/dataset/version1 extractedEntities/dataset/version2
* extractedEntities/dataset/... extractedEntities/publication/version1 extractedEntities/publication/version2
* extractedEntities/publication/... extractedEntities/unknown/version1 extractedEntities/unknown/version2
* extractedEntities/unknown/... extractedEntities/relation/version1 extractedEntities/relation/version2
* extractedEntities/relation/...
*/
public class SparkExtractEntitiesJob {
static final String IDJSONPATH = "$.id";
static final String SOURCEJSONPATH = "$.source";
static final String TARGETJSONPATH = "$.target";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkExtractEntitiesJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkExtractEntitiesJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String targetPath = parser.get("targetPath");
final String tdir = parser.get("targetDir");
final JavaRDD<String> inputRDD = sc.textFile(inputPath);
List<String> entities = Arrays
.stream(parser.get("entities").split(","))
.map(String::trim)
.collect(Collectors.toList());
if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) {
// Extract Dataset
inputRDD
.filter(SparkExtractEntitiesJob::isDataset)
.saveAsTextFile(targetPath + "/dataset/" + tdir, GzipCodec.class);
}
if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) {
// Extract Unknown
inputRDD
.filter(SparkExtractEntitiesJob::isUnknown)
.saveAsTextFile(targetPath + "/unknown/" + tdir, GzipCodec.class);
}
if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) {
// Extract Relation
inputRDD
.filter(SparkExtractEntitiesJob::isRelation)
.saveAsTextFile(targetPath + "/relation/" + tdir, GzipCodec.class);
}
if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) {
// Extract Publication
inputRDD
.filter(SparkExtractEntitiesJob::isPublication)
.saveAsTextFile(targetPath + "/publication/" + tdir, GzipCodec.class);
}
}
public static boolean isDataset(final String json) {
final String id = getJPathString(IDJSONPATH, json);
if (StringUtils.isBlank(id))
return false;
return id.startsWith("60|");
}
public static boolean isPublication(final String json) {
final String id = getJPathString(IDJSONPATH, json);
if (StringUtils.isBlank(id))
return false;
return id.startsWith("50|");
}
public static boolean isUnknown(final String json) {
final String id = getJPathString(IDJSONPATH, json);
if (StringUtils.isBlank(id))
return false;
return id.startsWith("70|");
}
public static boolean isRelation(final String json) {
final String source = getJPathString(SOURCEJSONPATH, json);
final String target = getJPathString(TARGETJSONPATH, json);
return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target);
}
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
}
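A small illustration of the prefix-based routing implemented by the predicates above; the JSON snippets and identifiers are made up for the example:

// Entities carry an "id" whose prefix determines their typology; relations only carry source/target.
SparkExtractEntitiesJob.isDataset("""{"id":"60|r3d100010134::0001"}""")        // true -> written under /dataset
SparkExtractEntitiesJob.isPublication("""{"id":"50|doi_________::0002"}""")    // true -> written under /publication
SparkExtractEntitiesJob.isUnknown("""{"id":"70|unknown______::0003"}""")       // true -> written under /unknown
SparkExtractEntitiesJob.isRelation("""{"source":"50|x","target":"60|y"}""")    // true -> written under /relation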

View File

@ -0,0 +1,90 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkResolveRelation {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/resolve_relations_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val relationPath = parser.get("relationPath")
log.info(s"sourcePath -> $relationPath")
val entityPath = parser.get("entityPath")
log.info(s"targetPath -> $entityPath")
val workingPath = parser.get("workingPath")
log.info(s"workingPath -> $workingPath")
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
import spark.implicits._
val entities:Dataset[Result] = spark.read.load(s"$entityPath/*").as[Result]
entities.flatMap(e => e.getPid.asScala
.map(p =>
convertPidToDNETIdentifier(p.getValue, p.getQualifier.getClassid))
.filter(s => s!= null)
.map(s => (s,e.getId))
).groupByKey(_._1)
.reduceGroups((x,y) => if (x._2.startsWith("50|doi") || x._2.startsWith("50|pmid")) x else y)
.map(s =>s._2)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/resolvedPid")
val rPid:Dataset[(String,String)] = spark.read.load(s"$workingPath/resolvedPid").as[(String,String)]
val relationDs:Dataset[(String,Relation)] = spark.read.load(relationPath).as[Relation].map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_1")), "left").map{
m =>
val sourceResolved = m._2
val currentRelation = m._1._2
if (sourceResolved!=null && sourceResolved._2.nonEmpty)
currentRelation.setSource(sourceResolved._2)
currentRelation
}.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/resolvedSource")
val relationSourceResolved:Dataset[(String,Relation)] = spark.read.load(s"$workingPath/resolvedSource").as[Relation].map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_1")), "left").map{
m =>
val targetResolved = m._2
val currentRelation = m._1._2
if (targetResolved!=null && targetResolved._2.nonEmpty)
currentRelation.setTarget(targetResolved._2)
currentRelation
}.filter(r => r.getSource.startsWith("50")&& r.getTarget.startsWith("50"))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/resolvedRelation")
}
def convertPidToDNETIdentifier(pid:String, pidType: String):String = {
if (pid==null || pid.isEmpty || pidType== null || pidType.isEmpty)
null
else
s"unresolved::${pid.toLowerCase}::${pidType.toLowerCase}"
}
}
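A quick usage note on the unresolved-identifier convention above (the DOI value is made up):

// convertPidToDNETIdentifier lower-cases both parts and prefixes them with "unresolved::"
SparkResolveRelation.convertPidToDNETIdentifier("10.1000/XYZ123", "DOI")
// -> "unresolved::10.1000/xyz123::doi"
// This matches the unresolved::<pid>::<pidType> convention used as relation target by
// BioDBToOAF.generate_unresolved_id elsewhere in this changeset, which the left joins
// above can then resolve back to real OpenAIRE identifiers.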

View File

@ -1,75 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
* In some cases the identifier generated for an entity in {@link SparkExtractEntitiesJob} is different from the
* identifier associated by the aggregator; this means that some relations point to a missing identifier. To avoid
* this problem we store in the model both the id and the originalObjIdentifier. This job extracts these pairs and
* creates a "similar" relation that will be used in SparkMergeEntities. (A worked example of the filter follows the class.)
*/
public class SparkSXGeneratePidSimlarity {
static final String IDJSONPATH = "$.id";
static final String OBJIDPATH = "$.originalObjIdentifier";
public static void generateDataFrame(
final SparkSession spark,
final JavaSparkContext sc,
final String inputPath,
final String targetPath) {
final JavaPairRDD<String, String> datasetSimRel = sc
.textFile(inputPath + "/dataset/*")
.mapToPair(
(PairFunction<String, String, String>) k -> new Tuple2<>(
DHPUtils.getJPathString(IDJSONPATH, k),
DHPUtils.getJPathString(OBJIDPATH, k)))
.filter(
t -> !StringUtils
.substringAfter(t._1(), "|")
.equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::")))
.distinct();
final JavaPairRDD<String, String> publicationSimRel = sc
.textFile(inputPath + "/publication/*")
.mapToPair(
(PairFunction<String, String, String>) k -> new Tuple2<>(
DHPUtils.getJPathString(IDJSONPATH, k),
DHPUtils.getJPathString(OBJIDPATH, k)))
.filter(
t -> !StringUtils
.substringAfter(t._1(), "|")
.equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::")))
.distinct();
JavaRDD<Relation> simRel = datasetSimRel
.union(publicationSimRel)
.map(
s -> {
final Relation r = new Relation();
r.setSource(s._1());
r.setTarget(s._2());
r.setRelType("similar");
return r;
});
spark
.createDataset(simRel.rdd(), Encoders.bean(Relation.class))
.distinct()
.write()
.mode(SaveMode.Overwrite)
.save(targetPath + "/pid_simRel");
}
}
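A worked example of the filter applied above, with made-up identifiers, showing when a pair survives and becomes a "similar" relation:

import org.apache.commons.lang3.StringUtils

// Hypothetical (id, originalObjIdentifier) pair extracted from a dataset record
val id    = "60|scholix_____::0001"
val objId = "datacite____::0002"
// The pair is kept only when the suffix of the generated id differs from the suffix of the original identifier
val keep = !StringUtils.substringAfter(id, "|").equalsIgnoreCase(StringUtils.substringAfter(objId, "::"))
// keep == true here, so a Relation(source = id, target = objId, relType = "similar") is emitted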

View File

@ -1,256 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.utils.DHPUtils;
import net.minidev.json.JSONArray;
import scala.Tuple2;
/**
* This job is responsible for the creation of the RAW graph. It is applied to the different entities generated by
* {@link SparkExtractEntitiesJob}. For dataset, publication and unknown entities we group all the entities of the
* same type by their identifier, and in the reduce phase we merge them. Merge means: merge all the metadata and
* merge the collectedFrom values.
* <p>
* For relations the work is different. Phase 1 is a map-reduce job: the map emits, for each relation, a key built
* from (source, relType, target) together with the relation itself, and the reduce merges all relations sharing the
* same key. Then, following the javadoc of {@link SparkSXGeneratePidSimlarity}, we take the dataset of pid relations
* and, joining by source and target, we replace the wrong identifiers in the relations with the correct ones. At the
* end we overwrite the relations with the new fixed Dataset of Relation. (A distilled trace follows the class.)
*/
public class SparkScholexplorerCreateRawGraphJob {
static final String IDJSONPATH = "$.id";
static final String SOURCEJSONPATH = "$.source";
static final String TARGETJSONPATH = "$.target";
static final String RELJSONPATH = "$.relType";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkScholexplorerCreateRawGraphJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.config(
new SparkConf()
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"))
.appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String targetPath = parser.get("targetPath");
final String entity = parser.get("entity");
FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration());
List<Path> subFolder = Arrays
.stream(fs.listStatus(new Path(inputPath)))
.filter(FileStatus::isDirectory)
.map(FileStatus::getPath)
.collect(Collectors.toList());
List<JavaRDD<String>> inputRdd = new ArrayList<>();
subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath())));
JavaRDD<String> union = sc.emptyRDD();
for (JavaRDD<String> item : inputRdd) {
union = union.union(item);
}
switch (entity) {
case "dataset":
union
.mapToPair(
(PairFunction<String, String, DLIDataset>) f -> {
final String id = getJPathString(IDJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class));
})
.reduceByKey(
(a, b) -> {
a.mergeFrom(b);
return a;
})
.map(
item -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(item._2());
})
.saveAsTextFile(targetPath, GzipCodec.class);
break;
case "publication":
union
.mapToPair(
(PairFunction<String, String, DLIPublication>) f -> {
final String id = getJPathString(IDJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class));
})
.reduceByKey(
(a, b) -> {
a.mergeFrom(b);
return a;
})
.map(
item -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(item._2());
})
.saveAsTextFile(targetPath, GzipCodec.class);
break;
case "unknown":
union
.mapToPair(
(PairFunction<String, String, DLIUnknown>) f -> {
final String id = getJPathString(IDJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class));
})
.reduceByKey(
(a, b) -> {
a.mergeFrom(b);
return a;
})
.map(
item -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(item._2());
})
.saveAsTextFile(targetPath, GzipCodec.class);
break;
case "relation":
SparkSXGeneratePidSimlarity
.generateDataFrame(
spark, sc, inputPath.replace("/relation", ""), targetPath.replace("/relation", ""));
RDD<Relation> rdd = union
.mapToPair(
(PairFunction<String, String, Relation>) f -> {
final String source = getJPathString(SOURCEJSONPATH, f);
final String target = getJPathString(TARGETJSONPATH, f);
final String reltype = getJPathString(RELJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper
.configure(
DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(
DHPUtils
.md5(
String
.format(
"%s::%s::%s",
source.toLowerCase(),
reltype.toLowerCase(),
target.toLowerCase())),
mapper.readValue(f, Relation.class));
})
.reduceByKey(
(a, b) -> {
a.mergeFrom(b);
return a;
})
.map(Tuple2::_2)
.rdd();
spark
.createDataset(rdd, Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Overwrite)
.save(targetPath);
Dataset<Relation> rel_ds = spark.read().load(targetPath).as(Encoders.bean(Relation.class));
System.out.println("LOADING PATH :" + targetPath.replace("/relation", "") + "/pid_simRel");
Dataset<Relation> sim_ds = spark
.read()
.load(targetPath.replace("/relation", "") + "/pid_simRel")
.as(Encoders.bean(Relation.class));
Dataset<Relation> ids = sim_ds
.map(
(MapFunction<Relation, Relation>) relation -> {
final String type = StringUtils.substringBefore(relation.getSource(), "|");
relation
.setTarget(
String
.format(
"%s|%s",
type, StringUtils.substringAfter(relation.getTarget(), "::")));
return relation;
},
Encoders.bean(Relation.class));
final Dataset<Relation> firstJoin = rel_ds
.joinWith(ids, ids.col("target").equalTo(rel_ds.col("source")), "left_outer")
.map(
(MapFunction<Tuple2<Relation, Relation>, Relation>) s -> {
if (s._2() != null) {
s._1().setSource(s._2().getSource());
}
return s._1();
},
Encoders.bean(Relation.class));
Dataset<Relation> secondJoin = firstJoin
.joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")), "left_outer")
.map(
(MapFunction<Tuple2<Relation, Relation>, Relation>) s -> {
if (s._2() != null) {
s._1().setTarget(s._2().getSource());
}
return s._1();
},
Encoders.bean(Relation.class));
secondJoin.write().mode(SaveMode.Overwrite).save(targetPath + "_fixed");
FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration());
fileSystem.delete(new Path(targetPath), true);
fileSystem.rename(new Path(targetPath + "_fixed"), new Path(targetPath));
}
}
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
}
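To make the relation branch easier to follow, here is a distilled trace of the pid_simRel fix-up under made-up identifiers:

// pid_simRel row (after the map over sim_ds): source = "60|scholix_____::0001", target = "60|0002"
// raw relation row:                           source = "60|0002",               target = "50|pub_______::0003"
// firstJoin  (ids.target == relation.source): relation.source is rewritten to "60|scholix_____::0001"
// secondJoin (ids.target == relation.target): no match, the target is left untouched
// final output, saved to <targetPath>_fixed and then renamed over <targetPath>:
//   Relation(source = "60|scholix_____::0001", target = "50|pub_______::0003")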

View File

@ -1,72 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
import scala.Tuple2;
/**
* This job reads a sequence file containing the XML records stored in the aggregator and generates an RDD of
* heterogeneous entities such as Dataset, Relation, Publication and Unknown
*/
public class SparkScholexplorerGraphImporter {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkScholexplorerGraphImporter.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkScholexplorerGraphImporter.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
RelationMapper relationMapper = RelationMapper.load();
sc
.sequenceFile(inputPath, IntWritable.class, Text.class)
.map(Tuple2::_2)
.map(Text::toString)
.repartition(500)
.flatMap(
(FlatMapFunction<String, Oaf>) record -> {
switch (parser.get("entity")) {
case "dataset":
final DatasetScholexplorerParser d = new DatasetScholexplorerParser();
return d.parseObject(record, relationMapper).iterator();
case "publication":
final PublicationScholexplorerParser p = new PublicationScholexplorerParser();
return p.parseObject(record, relationMapper).iterator();
default:
throw new IllegalArgumentException("wrong values of entities");
}
})
.map(
k -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(k);
})
.saveAsTextFile(parser.get("targetPath"), GzipCodec.class);
}
}

View File

@ -1,203 +0,0 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
import eu.dnetlib.dhp.sx.ebi.EBIAggregator
import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.LoggerFactory
import org.apache.spark.sql.functions.col
object SparkSplitOafTODLIEntities {
def getKeyRelation(rel:Relation):String = {
s"${rel.getSource}::${rel.getRelType}::${rel.getTarget}"
}
def extract_dataset(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf].repartition(4000)
val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset].repartition(1000)
OAFDataset
.filter(s => s != null && s.isInstanceOf[DLIDataset])
.map(s =>s.asInstanceOf[DLIDataset])
.union(ebi_dataset)
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.repartition(2000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset")
}
def extract_publication(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication].repartition(1000)
OAFDataset
.filter(s => s != null && s.isInstanceOf[DLIPublication])
.map(s =>s.asInstanceOf[DLIPublication])
.union(ebi_publication)
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
.map(p => p._2)
.repartition(2000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication")
}
def extract_unknown(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
OAFDataset
.filter(s => s != null && s.isInstanceOf[DLIUnknown])
.map(s =>s.asInstanceOf[DLIUnknown])
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, unkEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIUnknownAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown")
}
def extract_ids(o:Oaf) :(String, String) = {
o match {
case p: DLIPublication =>
val prefix = StringUtils.substringBefore(p.getId, "|")
val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
(p.getId, s"$prefix|$original")
case p: DLIDataset =>
val prefix = StringUtils.substringBefore(p.getId, "|")
val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
(p.getId, s"$prefix|$original")
case _ =>null
}
}
def extract_relations(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000)
OAFDataset
.filter(o => o.isInstanceOf[Result])
.map(extract_ids)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.filter(r => r != null)
.where("_1 != _2")
.select(col("_1").alias("newId"), col("_2").alias("oldId"))
.distinct()
.map(f => IdReplace(f.getString(0), f.getString(1)))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/id_replace")
OAFDataset
.filter(s => s != null && s.isInstanceOf[Relation])
.map(s =>s.asInstanceOf[Relation])
.union(ebi_relation)
.map(d => (getKeyRelation(d), d))(Encoders.tuple(Encoders.STRING, relEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2)
.repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation_unfixed")
val relations = spark.read.load(s"$workingPath/graph/relation_unfixed").as[Relation]
val ids = spark.read.load(s"$workingPath/graph/id_replace").as[IdReplace]
relations
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
.joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
.map(i =>{
val r = i._1._2
if (i._2 != null)
{
val id = i._2.newId
r.setSource(id)
}
r
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/rel_f_source")
val rel_source:Dataset[Relation] = spark.read.load(s"$workingPath/graph/rel_f_source").as[Relation]
rel_source
.map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoder))
.joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
.map(i =>{
val r:Relation = i._1._2
if (i._2 != null)
{
val id = i._2.newId
r.setTarget(id)
}
r
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")))
val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass)
parser.parseArgument(args)
val workingPath: String = parser.get("workingPath")
val entity:String = parser.get("entity")
logger.info(s"Working dir path = $workingPath")
val spark:SparkSession = SparkSession
.builder()
.appName(SparkSplitOafTODLIEntities.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master(parser.get("master"))
.getOrCreate()
entity match {
case "publication" => extract_publication(spark, workingPath)
case "dataset" => extract_dataset(spark,workingPath)
case "relation" => extract_relations(spark, workingPath)
case "unknown" => extract_unknown(spark, workingPath)
}
}
}

View File

@ -1,73 +0,0 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.LoggerFactory
import scala.collection.JavaConverters._
/**
* This new version of the job reads a sequence file containing the XML records stored in the aggregator and generates
* a Dataset of heterogeneous OAF entities such as Dataset, Relation, Publication and Unknown
*/
object SparkXMLToOAFDataset {
def main(args: Array[String]): Unit = {
val logger = LoggerFactory.getLogger(SparkXMLToOAFDataset.getClass)
val conf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkXMLToOAFDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json")))
parser.parseArgument(args)
val spark =
SparkSession
.builder()
.config(conf)
.appName(SparkXMLToOAFDataset.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sc = spark.sparkContext
implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val datasetEncoder:Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
implicit val publicationEncoder:Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
implicit val relationEncoder:Encoder[Relation] = Encoders.kryo[Relation]
val relationMapper = RelationMapper.load
val inputPath: String = parser.get("sourcePath")
val entity: String = parser.get("entity")
val targetPath = parser.get("targetPath")
logger.info(s"Input path is $inputPath")
logger.info(s"Entity path is $entity")
logger.info(s"Target Path is $targetPath")
val scholixRdd:RDD[Oaf] = sc.sequenceFile(inputPath, classOf[IntWritable], classOf[Text])
.map(s => s._2.toString)
.flatMap(s => {
entity match {
case "publication" =>
val p = new PublicationScholexplorerParser
val l =p.parseObject(s, relationMapper)
if (l != null) l.asScala else List()
case "dataset" =>
val d = new DatasetScholexplorerParser
val l =d.parseObject(s, relationMapper)
if (l != null) l.asScala else List()
}
}).filter(s => s!= null)
spark.createDataset(scholixRdd).write.mode(SaveMode.Append).save(targetPath)
}
}

View File

@ -0,0 +1,397 @@
package eu.dnetlib.dhp.sx.graph.bio
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Instance, KeyValue, Oaf, Relation, StructuredProperty}
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import scala.collection.JavaConverters._
object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {}
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
case class UniprotDate(date: String, date_info: String) {}
case class ScholixResolved(pid:String, pidType:String, typology:String, tilte:List[String], datasource:List[String], date:List[String], authors:List[String]){}
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val SUBJ_CLASS = "Keywords"
val DATE_RELATION_KEY = "RelationDate"
val resolvedURL:Map[String,String] = Map(
"genbank"-> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
"clinicaltrials.gov"-> "https://clinicaltrials.gov/ct2/show/",
"onim"-> "https://omim.org/entry/",
"refseq"-> "https://www.ncbi.nlm.nih.gov/nuccore/",
"geo"-> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
)
val collectedFromMap: Map[String, KeyValue] = {
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
val pubmedCollectedFrom:KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
PDBCollectedFrom.setDataInfo(DATA_INFO)
ElsevierCollectedFrom.setDataInfo(DATA_INFO)
EBICollectedFrom.setDataInfo(DATA_INFO)
pubmedCollectedFrom.setDataInfo(DATA_INFO)
enaCollectedFrom.setDataInfo(DATA_INFO)
ncbiCollectedFrom.setDataInfo(DATA_INFO)
springerNatureCollectedFrom.setDataInfo(DATA_INFO)
Map(
"uniprot" -> UNIPROTCollectedFrom,
"pdb"-> PDBCollectedFrom,
"elsevier" ->ElsevierCollectedFrom,
"ebi" ->EBICollectedFrom,
"Springer Nature" -> springerNatureCollectedFrom,
"NCBI Nucleotide" -> ncbiCollectedFrom,
"European Nucleotide Archive" -> enaCollectedFrom,
"Europe PMC" -> pubmedCollectedFrom
)
}
def crossrefLinksToOaf(input:String):Oaf = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
val relation_semantic= (json \ "RelationshipType" \ "Name").extract[String]
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type),collectedFromMap("elsevier"),"relationship", relation_semantic, date)
}
def scholixResolvedToOAF(input:ScholixResolved):Oaf = {
val d = new Dataset
d.setPid(
List(
OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setDataInfo(DATA_INFO)
val nsPrefix = input.pidType.toLowerCase.padTo(12, '_')
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
if (input.tilte != null && input.tilte.nonEmpty)
d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setOriginalId(List(input.pid).asJava)
val i = new Instance
i.setPid(d.getPid)
if (resolvedURL.contains(input.pidType)) {
i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava)
}
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
else
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
if (input.datasource == null || input.datasource.isEmpty)
return null
val ds = input.datasource.head
d.setCollectedfrom(List(collectedFromMap(ds)).asJava)
i.setCollectedfrom(collectedFromMap(ds))
d.setInstance(List(i).asJava)
if (input.authors != null && input.authors.nonEmpty) {
val authors = input.authors.map(a =>{
val authorOAF = new Author
authorOAF.setFullname(a)
authorOAF
})
d.setAuthor(authors.asJava)
}
if (input.date!= null && input.date.nonEmpty) {
val dt = input.date.head
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
}
d
}
def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pid = (json \ "pid").extract[String]
val d = new Dataset
d.setPid(
List(
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setDataInfo(DATA_INFO)
d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)
val title: String = (json \ "title").extractOrElse[String](null)
if (title != null)
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setOriginalId(List(pid).asJava)
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(collectedFromMap("uniprot"))
d.setInstance(List(i).asJava)
val dates: List[UniprotDate] = for {
JObject(dateOBJ) <- json \ "dates"
JField("date", JString(date)) <- dateOBJ
JField("date_info", JString(date_info)) <- dateOBJ
} yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
if (subjects != null) {
d.setSubject(
subjects.map(s =>
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
}
var i_date:Option[UniprotDate] = None
if (dates.nonEmpty) {
i_date = dates.find(d => d.date_info.contains("entry version"))
if (i_date.isDefined) {
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
.map(date => OafMapperUtils.structuredProperty(date.date, "UNKNOWN", "UNKNOWN", ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
if (relevant_dates != null && relevant_dates.nonEmpty)
d.setRelevantdate(relevant_dates.asJava)
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
val references_pmid: List[String] = for {
JObject(reference) <- json \ "references"
JField("PubMed", JString(pid)) <- reference
} yield pid
val references_doi: List[String] = for {
JObject(reference) <- json \ "references"
JField(" DOI", JString(pid)) <- reference
} yield pid
if (references_pmid != null && references_pmid.nonEmpty) {
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), "relationship", "isRelatedTo", if (i_date.isDefined) i_date.get.date else null)
rel.getCollectedfrom
List(d, rel)
}
else if (references_doi != null && references_doi.nonEmpty) {
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), "relationship", "isRelatedTo", if (i_date.isDefined) i_date.get.date else null)
List(d, rel)
}
else
List(d)
}
def generate_unresolved_id(pid:String, pidType:String) :String = {
s"unresolved::$pid::$pidType"
}
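// Example: generate_unresolved_id("6CU7", "pdb") yields "unresolved::6CU7::pdb"; the same
// "unresolved::<pid>::<pidType>" form is used as the relation target in createRelation below.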
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType:String, relClass:String, date:String):Relation = {
val rel = new Relation
rel.setDataInfo(DATA_INFO)
rel.setRelType("resultResult")
rel.setSubRelType(subRelType)
rel.setRelClass(relClass)
rel.setSource(sourceId)
rel.setTarget(s"unresolved::$pid::$pidType")
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setCollectedfrom(List(collectedFrom).asJava)
rel
}
def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date:String): Relation = {
createRelation(pid,pidType,sourceId,collectedFrom, "supplement","IsSupplementTo", date)
}
def pdbTOOaf(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pdb = (json \ "pdb").extract[String].toLowerCase
if (pdb.isEmpty)
return List()
val d = new Dataset
d.setPid(
List(
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
d.setDataInfo(DATA_INFO)
d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
d.setOriginalId(List(pdb).asJava)
val title = (json \ "title").extractOrElse[String](null)
if (title == null)
return List()
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
if (authors != null) {
val convertedAuthors = authors.zipWithIndex.map { a =>
val res = new Author
res.setFullname(a._1)
res.setRank(a._2 + 1)
res
}
d.setAuthor(convertedAuthors.asJava)
}
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(collectedFromMap("pdb"))
d.setInstance(List(i).asJava)
val pmid = (json \ "pmid").extractOrElse[String](null)
if (pmid != null)
List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
else
List(d)
}
def extractEBILinksFromDump(input: String): EBILinkItem = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pmid = (json \ "publication" \ "pmid").extract[String]
val links = (json \ "links").extract[JObject]
EBILinkItem(pmid.toLong, compact(render(links)))
}
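// A minimal sketch of the dump record this method assumes (hypothetical values):
//   { "publication": { "pmid": "33024307" }, "links": { ... } }
// The "links" object is re-serialized as-is and stored next to the numeric pmid in EBILinkItem.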
def EBITargetLinksFilter(input: EBILinks): Boolean = {
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
}
def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pmid = (json \ "request" \ "id").extract[String]
for {
JObject(link) <- json \\ "Link"
JField("Target", JObject(target)) <- link
JField("RelationshipType", JObject(relType)) <- link
JField("Name", JString(relation)) <- relType
JField("PublicationDate", JString(publicationDate)) <- link
JField("Title", JString(title)) <- target
JField("Identifier", JObject(identifier)) <- target
JField("IDScheme", JString(idScheme)) <- identifier
JField("IDURL", JString(idUrl)) <- identifier
JField("ID", JString(id)) <- identifier
} yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl)
}
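// Assumed shape of one EBI links payload consumed above (hypothetical values; only the
// fields extracted by the for-comprehension are shown):
//   { "request": { "id": "33024307" },
//     ... "Link": { "PublicationDate": "2020-10-06",
//                   "RelationshipType": { "Name": "IsReferencedBy" },
//                   "Target": { "Title": "...",
//                               "Identifier": { "ID": "6CU7", "IDScheme": "pdb", "IDURL": "https://..." } } } ... }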
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
val d = new Dataset
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
d.setDataInfo(DATA_INFO)
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
d.setPid(
List(
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, input.targetPidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(input.targetUrl).asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(collectedFromMap("ebi"))
d.setInstance(List(i).asJava)
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"),"relationship", "isRelatedTo", GraphCleaningFunctions.cleanDate(input.date)))
}
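// Note: padTo(12, '_') builds the 12-character namespace prefix, e.g. "pdb" becomes
// "pdb_________" and "uniprot" becomes "uniprot_____", matching the literal prefixes
// used in pdbTOOaf and uniprotToOAF above.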
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.sx.graph.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import BioDBToOAF.ScholixResolved
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkTransformBioDatabaseToOAF {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/bio_to_oaf_params.json")))
parser.parseArgument(args)
val database: String = parser.get("database")
log.info("database: {}", database)
val dbPath: String = parser.get("dbPath")
log.info("dbPath: {}", dbPath)
val targetPath: String = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sc = spark.sparkContext
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
import spark.implicits._
database.toUpperCase() match {
case "UNIPROT" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath)
case "PDB"=>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath)
case "SCHOLIX" =>
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).write.mode(SaveMode.Overwrite).save(targetPath)
case "CROSSREF_LINKS"=>
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath)
}
}
}
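A minimal usage sketch, assuming the argument names declared in bio_to_oaf_params.json mirror the keys read above (master, database, dbPath, targetPath); the exact flag spellings and paths below are assumptions, not taken from the repository:
// Hypothetical local invocation of the job defined above.
SparkTransformBioDatabaseToOAF.main(Array(
  "--master", "local[*]",
  "--database", "UNIPROT", // one of UNIPROT, PDB, SCHOLIX, CROSSREF_LINKS
  "--dbPath", "/tmp/bio/uniprot_dump", // hypothetical input path
  "--targetPath", "/tmp/bio/uniprot_oaf" // hypothetical output path
))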

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.sx.ebi.model;
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
import java.io.Serializable;
import java.util.ArrayList;
@ -16,6 +16,7 @@ public class PMArticle implements Serializable {
private String language;
private final List<PMSubject> subjects = new ArrayList<>();
private final List<PMSubject> publicationTypes = new ArrayList<>();
private List<PMAuthor> authors = new ArrayList<>();
public List<PMSubject> getPublicationTypes() {
return publicationTypes;
@ -35,8 +36,6 @@ public class PMArticle implements Serializable {
this.doi = doi;
}
private List<PMAuthor> authors = new ArrayList<>();
public String getPmid() {
return pmid;
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.sx.ebi.model;
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.sx.ebi.model;
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
public class PMGrant {

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.sx.ebi.model;
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
import java.io.Serializable;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dhp.sx.ebi.model
package eu.dnetlib.dhp.sx.graph.bio.pubmed
import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.sx.ebi.model;
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
public class PMSubject {
private String value;

View File

@ -0,0 +1,151 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import scala.collection.JavaConverters._
object PubMedToOaf {
val SUBJ_CLASS = "keywords"
val urlMap = Map(
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
"doi" -> "https://dx.doi.org/"
)
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm("dnet:result_typologies", vocabularies, cobjQualifier.getClassid)
result_typologies.getClassid match {
case "dataset" => new Dataset
case "publication" => new Publication
case "other" => new OtherResearchProduct
case "software" => new Software
case _ => null
}
}
def mapJournal(j: PMJournal): Journal = {
if (j == null)
return null
val journal = new Journal
journal.setDataInfo(dataInfo)
journal.setName(j.getTitle)
journal.setVol(j.getVolume)
journal.setIssnPrinted(j.getIssn)
journal.setIss(j.getIssue)
journal
}
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
val b = vocabularies.getTermAsQualifier(vocabularyName, term)
if (a == null) b else a
}
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = {
if (article.getPublicationTypes == null)
return null
val i = new Instance
var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
if (pidList == null)
return null
if (article.getDoi != null) {
pidList = pidList ::: List(OafMapperUtils.structuredProperty(article.getDoi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
}
// If the article contains the typology "Journal Article" we apply that instance type;
// otherwise we pick the first publication type that matches the vocabulary, and discard the record if none does
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
if (ja.isDefined) {
val cobjCategory = getVocabularyTerm("dnet:publication_resource", vocabularies, ja.get.getValue)
i.setInstancetype(cobjCategory)
} else {
val i_type = article.getPublicationTypes.asScala
.map(s => getVocabularyTerm("dnet:publication_resource", vocabularies, s.getValue))
.find(q => q != null)
if (i_type.isDefined)
i.setInstancetype(i_type.get)
else
return null
}
val result = createResult(i.getInstancetype, vocabularies)
if (result == null)
return result
result.setDataInfo(dataInfo)
i.setPid(pidList.asJava)
result.setInstance(List(i).asJava)
val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
.filter(t => t._1.nonEmpty)
.map(t => t._1 + t._2)
if (urlLists != null)
i.setUrl(urlLists.asJava)
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
i.setCollectedfrom(collectedFrom)
result.setPid(pidList.asJava)
if (article.getJournal != null && result.isInstanceOf[Publication])
result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal))
result.setCollectedfrom(List(collectedFrom).asJava)
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
if (article.getTitle == null || article.getTitle.isEmpty)
return null
result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
if (article.getDescription != null && article.getDescription.nonEmpty)
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
if (article.getLanguage != null) {
val term = vocabularies.getSynonymAsQualifier("dnet:languages", article.getLanguage)
if (term != null)
result.setLanguage(term)
}
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection breakOut)
if (subjects != null)
result.setSubject(subjects.asJava)
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
val author = new Author()
author.setName(a.getForeName)
author.setSurname(a.getLastName)
author.setFullname(a.getFullName)
author.setRank(index + 1)
author
}(collection breakOut)
if (authors != null && authors.nonEmpty)
result.setAuthor(authors.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(article.getPmid)
val id = IdentifierFactory.createIdentifier(result)
if (article.getPmid.equalsIgnoreCase(id))
return null
result.setId(id)
result
}
}

View File

@ -1,12 +1,16 @@
package eu.dnetlib.dhp.sx.ebi
package eu.dnetlib.dhp.sx.graph.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Result
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
import scala.xml.pull.XMLEventReader
@ -36,24 +40,37 @@ object SparkCreateBaselineDataFrame {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json")))
parser.parseArgument(args)
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
import spark.implicits._
val sc = spark.sparkContext
val workingPath = parser.get("workingPath")
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
@ -64,5 +81,13 @@ object SparkCreateBaselineDataFrame {
ds.map(p => (p.getPmid,p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
.agg(pmArticleAggregator.toColumn)
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
exported_dataset
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
.filter(p => p != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
//s"$workingPath/oaf/baseline_oaf"
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.sx.graph.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.graph.bio
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkEBILinksToOaf {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
import spark.implicits._
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val ebi_rdd:Dataset[EBILinkItem] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => BioDBToOAF.extractEBILinksFromDump(s))).as[EBILinkItem]
ebi_rdd.write.mode(SaveMode.Overwrite).save(s"${sourcePath}_dataset")
val ebLinks: Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links != null)
ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.repartition(4000)
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -1,4 +1,4 @@
package eu.dnetlib.sx.pangaea
package eu.dnetlib.dhp.sx.graph.pangaea
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}

View File

@ -1,7 +1,7 @@
package eu.dnetlib.sx.pangaea
package eu.dnetlib.dhp.sx.graph.pangaea
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame
import eu.dnetlib.dhp.sx.graph.ebi.SparkEBILinksToOaf
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}

View File

@ -1,223 +0,0 @@
package eu.dnetlib.dhp.sx.graph.parser;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.scholexplorer.relation.RelInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
public abstract class AbstractScholexplorerParser {
protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class);
static final Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE);
private final List<String> datasetSubTypes = Arrays
.asList(
"dataset",
"software",
"film",
"sound",
"physicalobject",
"audiovisual",
"collection",
"other",
"study",
"metadata");
public abstract List<Oaf> parseObject(final String record, final RelationMapper relMapper);
protected Map<String, String> getAttributes(final XMLStreamReader parser) {
final Map<String, String> attributesMap = new HashMap<>();
for (int i = 0; i < parser.getAttributeCount(); i++) {
attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
}
return attributesMap;
}
protected List<StructuredProperty> extractSubject(List<VtdUtilityParser.Node> subjects) {
final List<StructuredProperty> subjectResult = new ArrayList<>();
if (subjects != null && subjects.size() > 0) {
subjects
.forEach(
subjectMap -> {
final StructuredProperty subject = new StructuredProperty();
subject.setValue(subjectMap.getTextValue());
final Qualifier schema = new Qualifier();
schema.setClassid("dnet:subject");
schema.setClassname("dnet:subject");
schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme"));
schema.setSchemename(subjectMap.getAttributes().get("subjectScheme"));
subject.setQualifier(schema);
subjectResult.add(subject);
});
}
return subjectResult;
}
protected StructuredProperty extractIdentifier(
List<VtdUtilityParser.Node> identifierType, final String fieldName) {
final StructuredProperty pid = new StructuredProperty();
if (identifierType != null && identifierType.size() > 0) {
final VtdUtilityParser.Node result = identifierType.get(0);
pid.setValue(result.getTextValue());
final Qualifier pidType = new Qualifier();
pidType.setClassname(result.getAttributes().get(fieldName));
pidType.setClassid(result.getAttributes().get(fieldName));
pidType.setSchemename(ModelConstants.DNET_PID_TYPES);
pidType.setSchemeid(ModelConstants.DNET_PID_TYPES);
pid.setQualifier(pidType);
return pid;
}
return null;
}
protected void inferPid(final StructuredProperty input) {
final Matcher matcher = pattern.matcher(input.getValue());
if (matcher.find()) {
input.setValue(matcher.group());
if (input.getQualifier() == null) {
input.setQualifier(new Qualifier());
input.getQualifier().setSchemename(ModelConstants.DNET_PID_TYPES);
input.getQualifier().setSchemeid(ModelConstants.DNET_PID_TYPES);
}
input.getQualifier().setClassid("doi");
input.getQualifier().setClassname("doi");
}
}
protected String generateId(final String pid, final String pidType, final String entityType) {
String type;
switch (entityType) {
case "publication":
type = "50|";
break;
case "dataset":
type = "60|";
break;
case "unknown":
type = "70|";
break;
default:
throw new IllegalArgumentException("unexpected value " + entityType);
}
if ("dnet".equalsIgnoreCase(pidType))
return type + StringUtils.substringAfter(pid, "::");
return type
+ DHPUtils
.md5(
String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
}
protected DLIUnknown createUnknownObject(
final String pid,
final String pidType,
final KeyValue cf,
final DataInfo di,
final String dateOfCollection) {
final DLIUnknown uk = new DLIUnknown();
uk.setId(generateId(pid, pidType, "unknown"));
ProvenaceInfo pi = new ProvenaceInfo();
pi.setId(cf.getKey());
pi.setName(cf.getValue());
pi.setCompletionStatus("incomplete");
uk.setDataInfo(di);
uk.setDlicollectedfrom(Collections.singletonList(pi));
final StructuredProperty sourcePid = new StructuredProperty();
sourcePid.setValue(pid);
final Qualifier pt = new Qualifier();
pt.setClassname(pidType);
pt.setClassid(pidType);
pt.setSchemename(ModelConstants.DNET_PID_TYPES);
pt.setSchemeid(ModelConstants.DNET_PID_TYPES);
sourcePid.setQualifier(pt);
uk.setPid(Collections.singletonList(sourcePid));
uk.setDateofcollection(dateOfCollection);
return uk;
}
protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
final String schemeName) {
final Qualifier q = new Qualifier();
q.setClassid(classId);
q.setClassid(className);
q.setSchemeid(schemeId);
q.setSchemename(schemeName);
return q;
}
protected void generateRelations(
RelationMapper relationMapper,
Result parsedObject,
List<Oaf> result,
DataInfo di,
String dateOfCollection,
List<VtdUtilityParser.Node> relatedIdentifiers) {
if (relatedIdentifiers != null) {
result
.addAll(
relatedIdentifiers
.stream()
.flatMap(
n -> {
final List<Relation> rels = new ArrayList<>();
Relation r = new Relation();
r.setSource(parsedObject.getId());
final String relatedPid = n.getTextValue();
final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
String relationSemantic = n.getAttributes().get("relationType");
String inverseRelation;
final String targetId = generateId(relatedPid, relatedPidType, relatedType);
if (relationMapper.containsKey(relationSemantic.toLowerCase())) {
RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase());
relationSemantic = relInfo.getOriginal();
inverseRelation = relInfo.getInverse();
} else {
relationSemantic = "Unknown";
inverseRelation = "Unknown";
}
r.setTarget(targetId);
r.setRelType(relationSemantic);
r.setRelClass("datacite");
r.setCollectedfrom(parsedObject.getCollectedfrom());
r.setDataInfo(di);
rels.add(r);
r = new Relation();
r.setDataInfo(di);
r.setSource(targetId);
r.setTarget(parsedObject.getId());
r.setRelType(inverseRelation);
r.setRelClass("datacite");
r.setCollectedfrom(parsedObject.getCollectedfrom());
rels.add(r);
if ("unknown".equalsIgnoreCase(relatedType))
result
.add(
createUnknownObject(
relatedPid,
relatedPidType,
parsedObject.getCollectedfrom().get(0),
di,
dateOfCollection));
return rels.stream();
})
.collect(Collectors.toList()));
}
}
}

View File

@ -1,340 +0,0 @@
package eu.dnetlib.dhp.sx.graph.parser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
@Override
public List<Oaf> parseObject(String record, final RelationMapper relationMapper) {
try {
final DLIDataset parsedObject = new DLIDataset();
final VTDGen vg = new VTDGen();
vg.setDoc(record.getBytes());
final List<Oaf> result = new ArrayList<>();
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
DataInfo di = new DataInfo();
di.setTrust("0.9");
di.setDeletedbyinference(false);
di.setInvisible(false);
parsedObject.setDataInfo(di);
parsedObject
.setOriginalId(
Collections
.singletonList(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
parsedObject
.setOriginalObjIdentifier(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']");
parsedObject.setDateofcollection(dateOfCollection);
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
if (StringUtils.isNotBlank(resolvedDate)) {
StructuredProperty currentDate = new StructuredProperty();
currentDate.setValue(resolvedDate);
final Qualifier dateQualifier = new Qualifier();
dateQualifier.setClassname("resolvedDate");
dateQualifier.setClassid("resolvedDate");
dateQualifier.setSchemename("dnet::date");
dateQualifier.setSchemeid("dnet::date");
currentDate.setQualifier(dateQualifier);
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}
final String completionStatus = VtdUtilityParser
.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
final String publisher = VtdUtilityParser
.getSingleValue(
ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']");
List<VtdUtilityParser.Node> collectedFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='collectedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
List<VtdUtilityParser.Node> resolvededFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resolvedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
Field<String> pf = new Field<>();
pf.setValue(publisher);
parsedObject.setPublisher(pf);
final List<ProvenaceInfo> provenances = new ArrayList<>();
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
collectedFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode(provisionMode);
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
resolvededFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode("resolved");
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
parsedObject.setDlicollectedfrom(provenances);
parsedObject
.setCollectedfrom(
parsedObject
.getDlicollectedfrom()
.stream()
.map(
p -> {
final KeyValue cf = new KeyValue();
cf.setKey(p.getId());
cf.setValue(p.getName());
return cf;
})
.collect(Collectors.toList()));
parsedObject
.setCompletionStatus(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
final List<Node> identifierType = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resource']/*[local-name()='identifier']",
Collections.singletonList("identifierType"));
StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType");
if (currentPid == null)
return null;
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));
String resolvedURL = null;
switch (currentPid.getQualifier().getClassname().toLowerCase()) {
case "uniprot":
resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue();
break;
case "ena":
if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8);
break;
case "chembl":
resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue();
break;
case "ncbi-n":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "ncbi-p":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "genbank":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "pdb":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "url":
resolvedURL = currentPid.getValue();
break;
}
final String sourceId = generateId(
currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
parsedObject.setId(sourceId);
List<String> descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']");
if (descs != null && descs.size() > 0)
parsedObject
.setDescription(
descs
.stream()
// .map(it -> it.length() < 10000 ? it : it.substring(0, 10000))
.map(
it -> {
final Field<String> d = new Field<>();
d.setValue(it);
return d;
})
.collect(Collectors.toList()));
final List<Node> relatedIdentifiers = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='relatedIdentifier']",
Arrays
.asList(
"relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
generateRelations(
relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers);
final List<Node> hostedBy = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
if (hostedBy != null) {
parsedObject
.setInstance(
hostedBy
.stream()
.map(
it -> {
final Instance i = new Instance();
i.setUrl(Collections.singletonList(currentPid.getValue()));
KeyValue h = new KeyValue();
i.setHostedby(h);
h.setKey(it.getAttributes().get("id"));
h.setValue(it.getAttributes().get("name"));
return i;
})
.collect(Collectors.toList()));
}
List<StructuredProperty> subjects = extractSubject(
VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resource']//*[local-name()='subject']",
Collections.singletonList("subjectScheme")));
parsedObject.setSubject(subjects);
Qualifier q = new Qualifier();
q.setClassname("dataset");
q.setClassid("dataset");
q.setSchemename("dataset");
q.setSchemeid("dataset");
parsedObject.setResulttype(q);
parsedObject.setCompletionStatus(completionStatus);
final List<String> creators = VtdUtilityParser
.getTextValue(
ap,
vn,
"//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']");
if (creators != null && creators.size() > 0) {
parsedObject
.setAuthor(
creators
.stream()
.map(
a -> {
final Author author = new Author();
author.setFullname(a);
return author;
})
.collect(Collectors.toList()));
}
final List<String> titles = VtdUtilityParser
.getTextValue(
ap, vn, "//*[local-name()='resource']//*[local-name()='title']");
if (titles != null && titles.size() > 0) {
parsedObject
.setTitle(
titles
.stream()
.map(
t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
st.setQualifier(ModelConstants.MAIN_TITLE_QUALIFIER);
return st;
})
.collect(Collectors.toList()));
}
final List<String> dates = VtdUtilityParser
.getTextValue(
ap,
vn,
"//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']");
if (dates != null && dates.size() > 0) {
parsedObject
.setRelevantdate(
dates
.stream()
.map(
cd -> {
StructuredProperty date = new StructuredProperty();
date.setValue(cd);
final Qualifier dq = new Qualifier();
dq.setClassname("date");
dq.setClassid("date");
dq.setSchemename("dnet::date");
dq.setSchemeid("dnet::date");
date.setQualifier(dq);
return date;
})
.collect(Collectors.toList()));
}
// TERRIBLE HACK TO AVOID EMPTY COLLECTED FROM
if (parsedObject.getDlicollectedfrom() == null) {
final KeyValue cf = new KeyValue();
cf.setKey("dli_________::europe_pmc__");
cf.setValue("Europe PMC");
parsedObject.setCollectedfrom(Collections.singletonList(cf));
}
if (StringUtils.isNotBlank(resolvedURL)) {
Instance i = new Instance();
i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
i.setUrl(Collections.singletonList(resolvedURL));
parsedObject.setInstance(Collections.singletonList(i));
}
result.add(parsedObject);
return result;
} catch (Throwable e) {
log.error("Error on parsing record " + record, e);
return null;
}
}
}

View File

@ -1,264 +0,0 @@
package eu.dnetlib.dhp.sx.graph.parser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
public class PublicationScholexplorerParser extends AbstractScholexplorerParser {
@Override
public List<Oaf> parseObject(final String record, final RelationMapper relationMapper) {
try {
final List<Oaf> result = new ArrayList<>();
final DLIPublication parsedObject = new DLIPublication();
final VTDGen vg = new VTDGen();
vg.setDoc(record.getBytes());
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
final DataInfo di = new DataInfo();
di.setTrust("0.9");
di.setDeletedbyinference(false);
di.setInvisible(false);
String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']");
parsedObject.setDateofcollection(dateOfCollection);
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
parsedObject
.setOriginalId(
Collections
.singletonList(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
if (StringUtils.isNotBlank(resolvedDate)) {
StructuredProperty currentDate = new StructuredProperty();
currentDate.setValue(resolvedDate);
final Qualifier dateQualifier = new Qualifier();
dateQualifier.setClassname("resolvedDate");
dateQualifier.setClassid("resolvedDate");
dateQualifier.setSchemename("dnet::date");
dateQualifier.setSchemeid("dnet::date");
currentDate.setQualifier(dateQualifier);
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}
final List<Node> pid = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='pid']", Arrays.asList("type"));
StructuredProperty currentPid = extractIdentifier(pid, "type");
if (currentPid == null)
return null;
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));
final String sourceId = generateId(
currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication");
parsedObject.setId(sourceId);
parsedObject
.setOriginalObjIdentifier(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
List<Node> collectedFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='collectedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
List<Node> resolvededFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resolvedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']");
Field<String> pf = new Field<>();
pf.setValue(publisher);
parsedObject.setPublisher(pf);
final List<ProvenaceInfo> provenances = new ArrayList<>();
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
collectedFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode(provisionMode);
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
resolvededFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode("resolved");
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
parsedObject.setDlicollectedfrom(provenances);
parsedObject
.setCompletionStatus(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
parsedObject
.setCollectedfrom(
parsedObject
.getDlicollectedfrom()
.stream()
.map(
p -> {
final KeyValue cf = new KeyValue();
cf.setKey(p.getId());
cf.setValue(p.getName());
return cf;
})
.collect(Collectors.toList()));
final List<Node> relatedIdentifiers = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='relatedIdentifier']",
Arrays
.asList(
"relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
generateRelations(
relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers);
final List<Node> hostedBy = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
if (hostedBy != null) {
parsedObject
.setInstance(
hostedBy
.stream()
.map(
it -> {
final Instance i = new Instance();
i.setUrl(Collections.singletonList(currentPid.getValue()));
KeyValue h = new KeyValue();
i.setHostedby(h);
h.setKey(it.getAttributes().get("id"));
h.setValue(it.getAttributes().get("name"));
return i;
})
.collect(Collectors.toList()));
}
final List<String> authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']");
if (authorsNode != null)
parsedObject
.setAuthor(
authorsNode
.stream()
.map(
a -> {
final Author author = new Author();
author.setFullname(a);
return author;
})
.collect(Collectors.toList()));
final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']");
if (titles != null) {
parsedObject
.setTitle(
titles
.stream()
.map(
t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
st
.setQualifier(
generateQualifier(
"main title", "main title", "dnet:dataCite_title",
"dnet:dataCite_title"));
return st;
})
.collect(Collectors.toList()));
}
Field<String> description = new Field<>();
description
.setValue(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']"));
// if (StringUtils.isNotBlank(description.getValue())
// && description.getValue().length() > 10000) {
// description.setValue(description.getValue().substring(0, 10000));
// }
parsedObject.setDescription(Collections.singletonList(description));
final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']");
StructuredProperty date = new StructuredProperty();
date.setValue(cd);
final Qualifier dq = new Qualifier();
dq.setClassname("date");
dq.setClassid("date");
dq.setSchemename("dnet::date");
dq.setSchemeid("dnet::date");
date.setQualifier(dq);
parsedObject.setRelevantdate(Collections.singletonList(date));
List<StructuredProperty> subjects = extractSubject(
VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme")));
parsedObject.setSubject(subjects);
parsedObject.setDataInfo(di);
parsedObject.setSubject(subjects);
Qualifier q = new Qualifier();
q.setClassname("publication");
q.setClassid("publication");
q.setSchemename("publication");
q.setSchemeid("publication");
parsedObject.setResulttype(q);
result.add(parsedObject);
return result;
} catch (Throwable e) {
log.error("Input record: " + record);
log.error("Error on parsing record ", e);
return null;
}
}
}

View File

@ -0,0 +1,361 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Dataset, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixCollectedFrom, ScholixEntityId, ScholixIdentifier, ScholixRelationship, ScholixResource}
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.spark.sql.Encoders.bean
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
import scala.language.postfixOps
object ScholixUtils {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY:String = "RelationDate"
case class RelationVocabulary(original:String, inverse:String){}
case class RelatedEntities(id:String, relatedDataset:Long, relatedPublication:Long){}
val relations:Map[String, RelationVocabulary] = {
val input =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
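// The relations.json resource is expected to map lower-cased relation classes to their
// vocabulary entry; a hypothetical fragment:
//   { "issupplementto": { "original": "IsSupplementTo", "inverse": "IsSupplementedBy" } }
// scholixFromSource below looks up relation.getRelClass.toLowerCase in this map.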
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || relation.getProperties.isEmpty)
null
else {
val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
date.orNull
}
}
def extractRelationDate(summary: ScholixSummary): String = {
if (summary.getDate == null || summary.getDate.isEmpty)
null
else
summary.getDate.get(0)
}
def inverseRelationShip(rel:ScholixRelationship):ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
}
val statsAggregator:Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
val id = a._1
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
}
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
else if (b1 != null)
b1
else
b2
}
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s:Scholix):Boolean ={
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
}
else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
}
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
if (scholix_complete(b)) b else a._2
}
override def merge(b1: Scholix, b2: Scholix): Scholix = {
if (scholix_complete(b1)) b1 else b2
}
override def finish(reduction: Scholix): Scholix = reduction
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
def createInverseScholixRelation(scholix: Scholix):Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource)
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s
}
def extractCollectedFrom(summary:ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map{
d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
}(collection.breakOut)
l
} else List()
}
def extractCollectedFrom(relation: Relation) : List[ScholixEntityId] = {
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map {
c =>
new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA,null)).asJava)
}(collection breakOut)
l
} else List()
}
def generateCompleteScholix(scholix: Scholix, target:ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target))
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s
}
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
val r = new ScholixResource
r.setIdentifier(summaryObject.getLocalIdentifier)
r.setDnetIdentifier(summaryObject.getId)
r.setObjectType(summaryObject.getTypology.toString)
r.setObjectSubType(summaryObject.getSubType)
if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty)
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
val l:List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
, "collected", "complete"
)).toList
if (l.nonEmpty)
r.setCollectedFrom(l.asJava)
}
r
}
def scholixFromSource(relation:Relation, source:ScholixSummary):Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
if (l.isEmpty)
l = extractCollectedFrom(source)
if (l.isEmpty)
return null
s.setLinkprovider(l.asJava)
var d = extractRelationDate(relation)
if (d == null)
d = extractRelationDate(source)
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
.map{
p =>
new ScholixEntityId(p, null)
}(collection.breakOut)
if (l.nonEmpty)
s.setPublisher(l.asJava)
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setSource(generateScholixResourceFromSummary(source))
s
}
def findURLForPID(pidValue:List[StructuredProperty], urls:List[String]):List[(StructuredProperty, String)] = {
pidValue.map{
p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
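// Example (hypothetical values): for pids [doi "10.1000/xyz"] and urls
// ["https://doi.org/10.1000/xyz"], the doi is paired with that url because the url contains
// the pid value (case-insensitive); a pid with no matching url is paired with null.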
def extractTypedIdentifierFromInstance(r:Result):List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
}
def resultToSummary(r:Result):ScholixSummary = {
val s = new ScholixSummary
s.setId(r.getId)
if (r.getPid == null || r.getPid.isEmpty)
return null
val pids:List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
if (pids.isEmpty)
return null
s.setLocalIdentifier(pids.asJava)
if (r.isInstanceOf[Dataset])
s.setTypology(Typology.dataset)
else
s.setTypology(Typology.publication)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
val titles: List[String] = r.getTitle.asScala.map(t => t.getValue)(collection breakOut)
if (titles.nonEmpty)
s.setTitle(titles.asJava)
else
return null
}
if (r.getAuthor != null && !r.getAuthor.isEmpty) {
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname)(collection breakOut)
if (authors.nonEmpty)
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
val dt:List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut)
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
if (r.getDescription != null && !r.getDescription.isEmpty) {
val d = r.getDescription.asScala.find(f => f != null && f.getValue != null)
if (d.isDefined)
s.setDescription(d.get.getValue)
}
if (r.getSubject != null && !r.getSubject.isEmpty) {
val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut)
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
if (r.getPublisher != null)
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut)
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}
s.setRelatedDatasets(0)
s.setRelatedPublications(0)
s.setRelatedUnknown(0)
s
}
}

Some files were not shown because too many files have changed in this diff.