forked from D-Net/dnet-hadoop
merging with branch beta
commit b4eb026c8b
@ -1,2 +1,2 @@
|
||||||
# dnet-hadoop
|
# dnet-hadoop
|
||||||
Dnet-hadoop is a tool for
|
Dnet-hadoop is the project that defines all the OOZIE workflows for the OpenAIRE Graph construction, processing, and provisioning.
|
|
@ -25,6 +25,11 @@
|
||||||
<groupId>com.github.sisyphsu</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>dateparser</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>me.xuender</groupId>
|
||||||
|
<artifactId>unidecode</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_2.11</artifactId>
|
||||||
|
|
|
@ -67,6 +67,7 @@ public class AuthorMerger {
|
||||||
a -> a
|
a -> a
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
||||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||||
|
|
||||||
|
@ -78,6 +79,7 @@ public class AuthorMerger {
|
||||||
a -> a
|
a -> a
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
|
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
|
||||||
.map(p -> new Tuple2<>(p, a)))
|
.map(p -> new Tuple2<>(p, a)))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
@ -150,7 +152,7 @@ public class AuthorMerger {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean hasPid(Author a) {
|
private static boolean hasPid(Author a) {
|
||||||
if (a == null || a.getPid() == null || a.getPid().size() == 0)
|
if (a == null || a.getPid() == null || a.getPid().isEmpty())
|
||||||
return false;
|
return false;
|
||||||
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
||||||
}
|
}
|
||||||
|
@ -159,7 +161,10 @@ public class AuthorMerger {
|
||||||
if (StringUtils.isNotBlank(author.getSurname())) {
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
||||||
return new Person(author.getSurname() + ", " + author.getName(), false);
|
return new Person(author.getSurname() + ", " + author.getName(), false);
|
||||||
} else {
|
} else {
|
||||||
|
if (StringUtils.isNotBlank(author.getFullname()))
|
||||||
return new Person(author.getFullname(), false);
|
return new Person(author.getFullname(), false);
|
||||||
|
else
|
||||||
|
return new Person("", false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
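Note: the AuthorMerger changes above add a null filter on the pid streams and a fallback for authors without a usable name. A minimal sketch of the name-fallback logic, written in Scala for brevity (the actual class is Java):

    def authorToPerson(author: Author): Person =
      if (StringUtils.isNotBlank(author.getSurname))
        new Person(author.getSurname + ", " + author.getName, false)
      else if (StringUtils.isNotBlank(author.getFullname))
        new Person(author.getFullname, false)
      else
        new Person("", false) // new: authors with blank surname and fullname no longer reach the name parser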
@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter;
|
||||||
import java.time.format.DateTimeParseException;
|
import java.time.format.DateTimeParseException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
|
|
||||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import me.xuender.unidecode.Unidecode;
|
||||||
|
|
||||||
public class GraphCleaningFunctions extends CleaningFunctions {
|
public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
|
|
||||||
|
@ -98,7 +95,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
|
|
||||||
Result r = (Result) value;
|
Result r = (Result) value;
|
||||||
|
|
||||||
if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
|
if (Objects.isNull(r.getTitle()) || r.getTitle().isEmpty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
.filter(
|
.filter(
|
||||||
sp -> sp
|
sp -> {
|
||||||
|
final String title = sp
|
||||||
.getValue()
|
.getValue()
|
||||||
.toLowerCase()
|
.toLowerCase();
|
||||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
final String residual = Unidecode
|
||||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH)
|
.decode(title)
|
||||||
|
.replaceAll(TITLE_FILTER_REGEX, "");
|
||||||
|
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||||
|
})
|
||||||
.map(GraphCleaningFunctions::cleanValue)
|
.map(GraphCleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
|
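Note: two related cleaning changes appear above. Results with a null or empty title list are now filtered out (Objects.isNull || isEmpty), and each title is transliterated with Unidecode before TITLE_FILTER_REGEX is applied, so non-Latin titles keep their residual length instead of being discarded. A sketch of the new title predicate, in Scala for brevity (the actual code is Java):

    // keep a title only if its transliterated, filtered residue is long enough;
    // TITLE_FILTER_REGEX and TITLE_FILTER_RESIDUAL_LENGTH are the existing constants of GraphCleaningFunctions
    def keepTitle(value: String): Boolean = {
      val residual = Unidecode.decode(value.toLowerCase).replaceAll(TITLE_FILTER_REGEX, "")
      residual.length > TITLE_FILTER_RESIDUAL_LENGTH
    }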
@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.time.LocalDate;
|
|
||||||
import java.time.format.DateTimeFormatter;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import me.xuender.unidecode.Unidecode;
|
||||||
|
|
||||||
public class OafMapperUtilsTest {
|
public class OafMapperUtilsTest {
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testUnidecode() {
|
||||||
|
|
||||||
|
assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ"));
|
||||||
|
assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛"));
|
||||||
|
assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼"));
|
||||||
|
assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい"));
|
||||||
|
assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի"));
|
||||||
|
assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики"));
|
||||||
|
assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ"));
|
||||||
|
assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης"));
|
||||||
|
assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية"));
|
||||||
|
assertEquals("abc def ghi", Unidecode.decode("abc def ghi"));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testDateValidation() {
|
public void testDateValidation() {
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
package eu.dnetlib.dhp.actionmanager.datacite
|
package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
|
import org.apache.http.client.config.RequestConfig
|
||||||
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
|
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
|
||||||
import org.apache.http.entity.StringEntity
|
import org.apache.http.entity.StringEntity
|
||||||
import org.apache.http.impl.client.HttpClients
|
import org.apache.http.impl.client.{HttpClientBuilder, HttpClients}
|
||||||
|
|
||||||
import java.io.IOException
|
import java.io.IOException
|
||||||
|
|
||||||
|
@ -56,12 +57,16 @@ abstract class AbstractRestClient extends Iterator[String]{
|
||||||
|
|
||||||
|
|
||||||
private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={
|
private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={
|
||||||
val client = HttpClients.createDefault
|
val timeout = 60; // seconds
|
||||||
|
val config = RequestConfig.custom()
|
||||||
|
.setConnectTimeout(timeout * 1000)
|
||||||
|
.setConnectionRequestTimeout(timeout * 1000)
|
||||||
|
.setSocketTimeout(timeout * 1000).build()
|
||||||
|
val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||||
var tries = 4
|
var tries = 4
|
||||||
try {
|
|
||||||
while (tries > 0) {
|
while (tries > 0) {
|
||||||
|
|
||||||
println(s"requesting ${r.getURI}")
|
println(s"requesting ${r.getURI}")
|
||||||
|
try {
|
||||||
val response = client.execute(r)
|
val response = client.execute(r)
|
||||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||||
if (response.getStatusLine.getStatusCode > 400) {
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
|
@ -69,18 +74,14 @@ abstract class AbstractRestClient extends Iterator[String]{
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return IOUtils.toString(response.getEntity.getContent)
|
return IOUtils.toString(response.getEntity.getContent)
|
||||||
}
|
|
||||||
""
|
|
||||||
} catch {
|
} catch {
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
throw new RuntimeException("Error on executing request ", e)
|
println(s"Error on requesting ${r.getURI}")
|
||||||
} finally try client.close()
|
e.printStackTrace()
|
||||||
catch {
|
tries-=1
|
||||||
case e: IOException =>
|
|
||||||
throw new RuntimeException("Unable to close client ", e)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
""
|
||||||
|
}
|
||||||
getBufferData()
|
getBufferData()
|
||||||
|
|
||||||
}
|
}
|
|
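Note: doHTTPRequest now builds the client with explicit 60-second connect, connection-request and socket timeouts, and a failed attempt decrements the retry counter instead of aborting the iteration. A condensed sketch of the new behaviour (not the literal code; the handling of status codes above 400 is elided in this diff and not reproduced here):

    val timeout = 60 // seconds
    val config = RequestConfig.custom()
      .setConnectTimeout(timeout * 1000)
      .setConnectionRequestTimeout(timeout * 1000)
      .setSocketTimeout(timeout * 1000)
      .build()
    val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()

    def attempt(r: HttpUriRequest, tries: Int): String =
      if (tries <= 0) "" // all attempts failed: return the empty string
      else
        try IOUtils.toString(client.execute(r).getEntity.getContent)
        catch { case e: Throwable => e.printStackTrace(); attempt(r, tries - 1) }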
@ -367,7 +367,7 @@ object DataciteToOAFTransformation {
|
||||||
|
|
||||||
|
|
||||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||||
result.setDateoftransformation(ISO8601FORMAT.format(ts))
|
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
||||||
result.setDataInfo(dataInfo)
|
result.setDataInfo(dataInfo)
|
||||||
|
|
||||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||||
|
@ -532,11 +532,11 @@ object DataciteToOAFTransformation {
|
||||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||||
} yield awardUri
|
} yield awardUri
|
||||||
|
|
||||||
|
result.setId(IdentifierFactory.createIdentifier(result))
|
||||||
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||||
|
|
||||||
|
|
||||||
fix_figshare(result)
|
fix_figshare(result)
|
||||||
result.setId(IdentifierFactory.createIdentifier(result))
|
|
||||||
if (result.getId == null)
|
if (result.getId == null)
|
||||||
return List()
|
return List()
|
||||||
|
|
||||||
|
|
|
@ -140,7 +140,7 @@ object ImportDatacite {
|
||||||
|
|
||||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
|
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
|
||||||
var from:Long = timestamp * 1000
|
var from:Long = timestamp * 1000
|
||||||
val delta:Long = 50000000L
|
val delta:Long = 100000000L
|
||||||
var client: DataciteAPIImporter = null
|
var client: DataciteAPIImporter = null
|
||||||
val now :Long =System.currentTimeMillis()
|
val now :Long =System.currentTimeMillis()
|
||||||
var i = 0
|
var i = 0
|
||||||
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.scholix
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql._
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
object SparkCreateActionset {
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
val conf: SparkConf = new SparkConf()
|
||||||
|
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
|
||||||
|
parser.parseArgument(args)
|
||||||
|
|
||||||
|
|
||||||
|
val spark: SparkSession =
|
||||||
|
SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.appName(getClass.getSimpleName)
|
||||||
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
val sourcePath = parser.get("sourcePath")
|
||||||
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
|
|
||||||
|
val targetPath = parser.get("targetPath")
|
||||||
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
|
val workingDirFolder = parser.get("workingDirFolder")
|
||||||
|
log.info(s"workingDirFolder -> $workingDirFolder")
|
||||||
|
|
||||||
|
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
|
||||||
|
implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
|
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
|
||||||
|
|
||||||
|
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||||
|
.flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
|
||||||
|
|
||||||
|
|
||||||
|
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
|
||||||
|
|
||||||
|
log.info("extract source and target Identifier involved in relations")
|
||||||
|
|
||||||
|
|
||||||
|
log.info("save relation filtered")
|
||||||
|
|
||||||
|
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||||
|
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
|
||||||
|
|
||||||
|
log.info("saving entities")
|
||||||
|
|
||||||
|
val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
|
||||||
|
|
||||||
|
|
||||||
|
entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
|
||||||
|
entities
|
||||||
|
.joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
|
||||||
|
.map(p => p._1._2)
|
||||||
|
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
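Note: the new SparkCreateActionset job applies the same relation filter twice — keep only relations that are not deleted by inference and whose relClass is not a merge relation — and then selects the entities appearing as their source or target. A sketch of that predicate factored out (an illustration, not part of the commit):

    def keepRelation(r: Relation): Boolean =
      (r.getDataInfo == null || !r.getDataInfo.getDeletedbyinference) &&
        !r.getRelClass.toLowerCase.contains("merge")

    // usage: relation.filter(keepRelation _)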
@ -0,0 +1,86 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.scholix
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
|
||||||
|
import org.apache.hadoop.io.Text
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
object SparkSaveActionSet {
|
||||||
|
|
||||||
|
|
||||||
|
def toActionSet(item: Oaf): (String, String) = {
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
item match {
|
||||||
|
case dataset: OafDataset =>
|
||||||
|
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||||
|
a.setClazz(classOf[OafDataset])
|
||||||
|
a.setPayload(dataset)
|
||||||
|
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case publication: Publication =>
|
||||||
|
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||||
|
a.setClazz(classOf[Publication])
|
||||||
|
a.setPayload(publication)
|
||||||
|
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case software: Software =>
|
||||||
|
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||||
|
a.setClazz(classOf[Software])
|
||||||
|
a.setPayload(software)
|
||||||
|
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case orp: OtherResearchProduct =>
|
||||||
|
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||||
|
a.setClazz(classOf[OtherResearchProduct])
|
||||||
|
a.setPayload(orp)
|
||||||
|
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
|
||||||
|
case relation: Relation =>
|
||||||
|
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||||
|
a.setClazz(classOf[Relation])
|
||||||
|
a.setPayload(relation)
|
||||||
|
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case _ =>
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
val conf: SparkConf = new SparkConf()
|
||||||
|
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString)
|
||||||
|
parser.parseArgument(args)
|
||||||
|
|
||||||
|
|
||||||
|
val spark: SparkSession =
|
||||||
|
SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.appName(getClass.getSimpleName)
|
||||||
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
val sourcePath = parser.get("sourcePath")
|
||||||
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
|
|
||||||
|
val targetPath = parser.get("targetPath")
|
||||||
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
|
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
|
||||||
|
spark.read.load(sourcePath).as[Oaf]
|
||||||
|
.map(o => toActionSet(o))
|
||||||
|
.filter(o => o != null)
|
||||||
|
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
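Note: SparkSaveActionSet serialises each Oaf entity into a (class name, AtomicAction JSON) pair and writes the pairs as a gzip-compressed SequenceFile of Text keys and values. A sketch of how such an output could be read back for inspection (the path and session settings are assumptions):

    import org.apache.hadoop.io.Text
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("inspect-actionset").master("local[*]").getOrCreate()
    val targetPath = "/path/to/actionset" // assumed: the workflow's outputPath
    val actions = spark.sparkContext
      .sequenceFile(targetPath, classOf[Text], classOf[Text])
      .map { case (clazz, payload) => (clazz.toString, payload.toString) } // copy out of the reused Writables
    actions.take(1).foreach(println)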
@ -16,7 +16,7 @@
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="TransformJob"/>
|
<start to="ImportDatacite"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
[
|
||||||
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||||
|
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "source path", "paramRequired": true},
|
||||||
|
{"paramName":"w", "paramLongName":"workingDirFolder","paramDescription": "the working Dir Folder", "paramRequired": true},
|
||||||
|
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the target path ", "paramRequired": true}
|
||||||
|
]
|
|
@ -0,0 +1,23 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,76 @@
|
||||||
|
<workflow-app name="Scholexplorer_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<description>the path of the consistent graph</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>workingDirFolder</name>
|
||||||
|
<description>the path of working dir ActionSet</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>outputPath</name>
|
||||||
|
<description>the path of Scholexplorer ActionSet</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="createActionSet"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="createActionSet">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Create Action Set</name>
|
||||||
|
<class>eu.dnetlib.dhp.actionmanager.scholix.SparkCreateActionset</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${outputPath}</arg>
|
||||||
|
<arg>--workingDirFolder</arg><arg>${workingDirFolder}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="SaveActionSet"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="SaveActionSet">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Save Action Set</name>
|
||||||
|
<class>eu.dnetlib.dhp.actionmanager.scholix.SparkSaveActionSet</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${workingDirFolder}/actionSetOaf</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${outputPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,5 @@
|
||||||
|
[
|
||||||
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||||
|
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "source path", "paramRequired": true},
|
||||||
|
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the target path ", "paramRequired": true}
|
||||||
|
]
|
|
@ -3,13 +3,14 @@ package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import com.fasterxml.jackson.databind.SerializationFeature
|
import com.fasterxml.jackson.databind.SerializationFeature
|
||||||
|
|
||||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import org.junit.jupiter.api.extension.ExtendWith
|
import org.junit.jupiter.api.extension.ExtendWith
|
||||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||||
import org.mockito.junit.jupiter.MockitoExtension
|
import org.mockito.junit.jupiter.MockitoExtension
|
||||||
|
|
||||||
|
import java.text.SimpleDateFormat
|
||||||
|
import java.util.Locale
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
|
||||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||||
|
@ -22,6 +23,18 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
|
||||||
super.setUpVocabulary()
|
super.setUpVocabulary()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testDateMapping:Unit = {
|
||||||
|
val inputDate = "2021-07-14T11:52:54+0000"
|
||||||
|
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||||
|
val dt = ISO8601FORMAT.parse(inputDate)
|
||||||
|
println(dt.getTime)
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testMapping() :Unit = {
|
def testMapping() :Unit = {
|
||||||
val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
|
val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset {
|
public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDat
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isReferencedBy");
|
return relType.equals(ModelConstants.IS_REFERENCED_BY);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset {
|
public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDatase
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isRelatedTo");
|
return relType.equals(ModelConstants.IS_RELATED_TO);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset {
|
public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingD
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isSupplementedBy");
|
return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset {
|
public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingD
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isSupplementedTo");
|
return relType.equals(ModelConstants.IS_SUPPLEMENT_TO);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset {
|
public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("references");
|
return relType.equals(ModelConstants.REFERENCES);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication {
|
public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissin
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isReferencedBy");
|
return relType.equals(ModelConstants.IS_REFERENCED_BY);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication {
|
public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPu
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isRelatedTo");
|
return relType.equals(ModelConstants.IS_RELATED_TO);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication {
|
public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMiss
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isSupplementedBy");
|
return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication {
|
public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMiss
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("isSupplementedTo");
|
return relType.equals(ModelConstants.IS_SUPPLEMENT_TO);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication {
|
public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPub
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean filterByType(final String relType) {
|
protected boolean filterByType(final String relType) {
|
||||||
return relType.equals("references");
|
return relType.equals(ModelConstants.REFERENCES);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class ClusterUtils {
|
public class ClusterUtils {
|
||||||
|
@ -52,15 +53,15 @@ public class ClusterUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isDedupRoot(final String id) {
|
public static boolean isDedupRoot(final String id) {
|
||||||
return id.contains("dedup_wf_");
|
return id.contains("dedup");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static final boolean isValidResultResultClass(final String s) {
|
public static final boolean isValidResultResultClass(final String s) {
|
||||||
return s.equals("isReferencedBy")
|
return s.equals(ModelConstants.IS_REFERENCED_BY)
|
||||||
|| s.equals("isRelatedTo")
|
|| s.equals(ModelConstants.IS_RELATED_TO)
|
||||||
|| s.equals("references")
|
|| s.equals(ModelConstants.REFERENCES)
|
||||||
|| s.equals("isSupplementedBy")
|
|| s.equals(ModelConstants.IS_SUPPLEMENTED_BY)
|
||||||
|| s.equals("isSupplementedTo");
|
|| s.equals(ModelConstants.IS_SUPPLEMENT_TO);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T> T incrementAccumulator(final T o, final LongAccumulator acc) {
|
public static <T> T incrementAccumulator(final T o, final LongAccumulator acc) {
|
||||||
|
|
|
@ -23,6 +23,7 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
@ -77,17 +78,22 @@ public class SparkUpdateEntity extends AbstractSparkAction {
|
||||||
(type, clazz) -> {
|
(type, clazz) -> {
|
||||||
final String outputPath = dedupGraphPath + "/" + type;
|
final String outputPath = dedupGraphPath + "/" + type;
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, outputPath);
|
||||||
|
final String ip = DedupUtility.createEntityPath(graphBasePath, type.toString());
|
||||||
|
if (HdfsSupport.exists(ip, sc.hadoopConfiguration())) {
|
||||||
JavaRDD<String> sourceEntity = sc
|
JavaRDD<String> sourceEntity = sc
|
||||||
.textFile(DedupUtility.createEntityPath(graphBasePath, type.toString()));
|
.textFile(DedupUtility.createEntityPath(graphBasePath, type.toString()));
|
||||||
|
|
||||||
if (mergeRelExists(workingPath, type.toString())) {
|
if (mergeRelExists(workingPath, type.toString())) {
|
||||||
|
|
||||||
final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, "*", type.toString());
|
final String mergeRelPath = DedupUtility
|
||||||
|
.createMergeRelPath(workingPath, "*", type.toString());
|
||||||
final String dedupRecordPath = DedupUtility
|
final String dedupRecordPath = DedupUtility
|
||||||
.createDedupRecordPath(workingPath, "*", type.toString());
|
.createDedupRecordPath(workingPath, "*", type.toString());
|
||||||
|
|
||||||
final Dataset<Relation> rel = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
|
final Dataset<Relation> rel = spark
|
||||||
|
.read()
|
||||||
|
.load(mergeRelPath)
|
||||||
|
.as(Encoders.bean(Relation.class));
|
||||||
|
|
||||||
final JavaPairRDD<String, String> mergedIds = rel
|
final JavaPairRDD<String, String> mergedIds = rel
|
||||||
.where("relClass == 'merges'")
|
.where("relClass == 'merges'")
|
||||||
|
@ -119,6 +125,7 @@ public class SparkUpdateEntity extends AbstractSparkAction {
|
||||||
}
|
}
|
||||||
|
|
||||||
sourceEntity.saveAsTextFile(outputPath, GzipCodec.class);
|
sourceEntity.saveAsTextFile(outputPath, GzipCodec.class);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,16 @@
|
||||||
package eu.dnetlib.doiboost
|
package eu.dnetlib.doiboost
|
||||||
|
|
||||||
|
import java.time.LocalDate
|
||||||
|
import java.time.format.DateTimeFormatter
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instance, KeyValue, Oaf, OpenAccessRoute, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.commons.lang3.StringUtils
|
import org.apache.commons.lang3.StringUtils
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||||
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{getClosedAccessQualifier, getEmbargoedAccessQualifier, getUnknownQualifier}
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
@ -118,10 +122,74 @@ object DoiBoostMappingUtil {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def decideAccessRight(lic : Field[String], date:String) : AccessRight = {
|
||||||
|
if(lic == null){
|
||||||
|
//Default value Unknown
|
||||||
|
return getUnknownQualifier()
|
||||||
|
}
|
||||||
|
val license : String = lic.getValue
|
||||||
|
//CC licenses
|
||||||
|
if(license.startsWith("cc") ||
|
||||||
|
license.startsWith("http://creativecommons.org/licenses") ||
|
||||||
|
license.startsWith("https://creativecommons.org/licenses") ||
|
||||||
|
|
||||||
|
//ACS Publications Author choice licenses (considered OPEN also by Unpaywall)
|
||||||
|
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html") ||
|
||||||
|
license.equals("http://pubs.acs.org/page/policy/authorchoice_termsofuse.html") ||
|
||||||
|
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") ||
|
||||||
|
|
||||||
|
//APA (considered OPEN also by Unpaywall)
|
||||||
|
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")){
|
||||||
|
|
||||||
|
val oaq : AccessRight = getOpenAccessQualifier()
|
||||||
|
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||||
|
return oaq
|
||||||
|
}
|
||||||
|
|
||||||
|
//OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED)
|
||||||
|
if(license.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")){
|
||||||
|
val now = java.time.LocalDate.now
|
||||||
|
|
||||||
|
try{
|
||||||
|
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd"))
|
||||||
|
if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
|
||||||
|
val oaq : AccessRight = getOpenAccessQualifier()
|
||||||
|
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||||
|
return oaq
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
return getEmbargoedAccessQualifier()
|
||||||
|
}
|
||||||
|
}catch {
|
||||||
|
case e: Exception => {
|
||||||
|
try{
|
||||||
|
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
|
||||||
|
if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
|
||||||
|
val oaq : AccessRight = getOpenAccessQualifier()
|
||||||
|
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||||
|
return oaq
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
return getEmbargoedAccessQualifier()
|
||||||
|
}
|
||||||
|
}catch{
|
||||||
|
case ex: Exception => return getClosedAccessQualifier()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return getClosedAccessQualifier()
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getOpenAccessQualifier():AccessRight = {
|
def getOpenAccessQualifier():AccessRight = {
|
||||||
OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
|
||||||
|
OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||||
}
|
}
|
||||||
|
|
||||||
def getRestrictedQualifier():AccessRight = {
|
def getRestrictedQualifier():AccessRight = {
|
||||||
|
@ -129,6 +197,20 @@ object DoiBoostMappingUtil {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def getUnknownQualifier():AccessRight = {
|
||||||
|
OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def getEmbargoedAccessQualifier():AccessRight = {
|
||||||
|
OafMapperUtils.accessRight("EMBARGO","Embargo",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||||
|
}
|
||||||
|
|
||||||
|
def getClosedAccessQualifier():AccessRight = {
|
||||||
|
OafMapperUtils.accessRight("CLOSED","Closed Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractInstance(r:Result):Option[Instance] = {
|
def extractInstance(r:Result):Option[Instance] = {
|
||||||
r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||||
}
|
}
|
||||||
|
@ -150,10 +232,11 @@ object DoiBoostMappingUtil {
|
||||||
if (item != null) {
|
if (item != null) {
|
||||||
hb.setValue(item.officialname)
|
hb.setValue(item.officialname)
|
||||||
hb.setKey(generateDSId(item.id))
|
hb.setKey(generateDSId(item.id))
|
||||||
if (item.openAccess)
|
if (item.openAccess) {
|
||||||
i.setAccessright(getOpenAccessQualifier())
|
i.setAccessright(getOpenAccessQualifier())
|
||||||
val ar = getOpenAccessQualifier()
|
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||||
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
hb = ModelConstants.UNKNOWN_REPOSITORY
|
hb = ModelConstants.UNKNOWN_REPOSITORY
|
||||||
|
@ -161,17 +244,8 @@ object DoiBoostMappingUtil {
|
||||||
i.setHostedby(hb)
|
i.setHostedby(hb)
|
||||||
})
|
})
|
||||||
|
|
||||||
val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
|
publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance()))
|
||||||
if (ar.nonEmpty) {
|
|
||||||
if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){
|
|
||||||
val ar = getOpenAccessQualifier()
|
|
||||||
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
val ar = getRestrictedQualifier()
|
|
||||||
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
publication
|
publication
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
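Note: decideAccessRight above parses the publication date with the pattern yyyy-MM-dd and, on failure, retries with yyyy-MM-dd'T'HH:mm:ss'Z' inside nested try/catch blocks; OUP records become OPEN (hybrid route) only when more than twelve months have passed since publication, otherwise EMBARGO, and CLOSED when the date cannot be parsed. A compact sketch of the same rule, assuming the DoiBoostMappingUtil context (an illustration, not the committed code):

    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import scala.util.Try

    private val datePatterns = Seq("yyyy-MM-dd", "yyyy-MM-dd'T'HH:mm:ss'Z'")

    def parsePublicationDate(date: String): Option[LocalDate] =
      datePatterns.view
        .flatMap(p => Try(LocalDate.parse(date, DateTimeFormatter.ofPattern(p))).toOption)
        .headOption

    def oupAccessRight(date: String): AccessRight =
      parsePublicationDate(date) match {
        case Some(d) if (LocalDate.now.toEpochDay - d.toEpochDay) / 365.0 > 1 =>
          val oaq = getOpenAccessQualifier()
          oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
          oaq
        case Some(_) => getEmbargoedAccessQualifier()
        case None    => getClosedAccessQualifier()
      }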
@ -4,7 +4,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{decideAccessRight, _}
|
||||||
import org.apache.commons.lang.StringUtils
|
import org.apache.commons.lang.StringUtils
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
|
@ -168,12 +168,22 @@ case object Crossref2Oaf {
|
||||||
// Mapping instance
|
// Mapping instance
|
||||||
val instance = new Instance()
|
val instance = new Instance()
|
||||||
val license = for {
|
val license = for {
|
||||||
JString(lic) <- json \ "license" \ "URL"
|
JObject(license) <- json \ "license"
|
||||||
} yield asField(lic)
|
JField("URL", JString(lic)) <- license
|
||||||
val l = license.filter(d => StringUtils.isNotBlank(d.getValue))
|
JField("content-version", JString(content_version)) <- license
|
||||||
if (l.nonEmpty)
|
} yield (asField(lic), content_version)
|
||||||
instance.setLicense(l.head)
|
val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))
|
||||||
|
if (l.nonEmpty){
|
||||||
|
if (l exists (d => d._2.equals("vor"))){
|
||||||
|
for(d <- l){
|
||||||
|
if (d._2.equals("vor")){
|
||||||
|
instance.setLicense(d._1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
instance.setLicense(l.head._1)}
|
||||||
|
}
|
||||||
|
|
||||||
// Ticket #6281 added pid to Instance
|
// Ticket #6281 added pid to Instance
|
||||||
instance.setPid(result.getPid)
|
instance.setPid(result.getPid)
|
||||||
|
@ -185,7 +195,7 @@ case object Crossref2Oaf {
|
||||||
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
|
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
|
||||||
}
|
}
|
||||||
|
|
||||||
instance.setAccessright(getRestrictedQualifier())
|
instance.setAccessright(decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue))
|
||||||
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||||
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||||
|
|
||||||
|
|
|
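Note: the license mapping above now extracts both the URL and the content-version of each Crossref license entry and prefers the one marked "vor" (version of record) over the first entry. The nested loop that picks it could equivalently be written as (a sketch, not part of the commit):

    val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))
    if (l.nonEmpty)
      instance.setLicense(l.find(_._2 == "vor").getOrElse(l.head)._1)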
@ -11,6 +11,7 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||||
|
import eu.dnetlib.doiboost.uw.UnpayWallToOAF.get_unpaywall_color
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,6 +24,21 @@ case class OALocation(evidence:Option[String], host_type:Option[String], is_best
|
||||||
object UnpayWallToOAF {
|
object UnpayWallToOAF {
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
|
||||||
|
|
||||||
|
def get_unpaywall_color(input:String):Option[OpenAccessRoute] = {
|
||||||
|
if(input == null || input.equalsIgnoreCase("close"))
|
||||||
|
return None
|
||||||
|
if(input.equalsIgnoreCase("green"))
|
||||||
|
return Some(OpenAccessRoute.green)
|
||||||
|
if(input.equalsIgnoreCase("bronze"))
|
||||||
|
return Some(OpenAccessRoute.bronze)
|
||||||
|
if(input.equalsIgnoreCase("hybrid"))
|
||||||
|
return Some(OpenAccessRoute.hybrid)
|
||||||
|
else
|
||||||
|
return Some(OpenAccessRoute.gold)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = {
|
def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = {
|
||||||
if (is_oa) {
|
if (is_oa) {
|
||||||
if (location.host_type.isDefined) {
|
if (location.host_type.isDefined) {
|
||||||
|
@ -65,7 +81,7 @@ object UnpayWallToOAF {
|
||||||
|
|
||||||
val oaLocation:OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
|
val oaLocation:OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
|
||||||
|
|
||||||
val colour = get_color(is_oa, oaLocation, journal_is_oa)
|
val colour = get_unpaywall_color((json \ "oa_status").extractOrElse[String](null))
|
||||||
|
|
||||||
pub.setCollectedfrom(List(createUnpayWallCollectedFrom()).asJava)
|
pub.setCollectedfrom(List(createUnpayWallCollectedFrom()).asJava)
|
||||||
pub.setDataInfo(generateDataInfo())
|
pub.setDataInfo(generateDataInfo())
|
||||||
|
|
|
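Note: get_unpaywall_color above maps the Unpaywall oa_status field onto the OpenAccessRoute enumeration: null or "close" yield no route, green/bronze/hybrid map to themselves, and anything else falls back to gold. An equivalent match-based form (a readability sketch, not part of the commit):

    def unpaywallColour(status: String): Option[OpenAccessRoute] =
      Option(status).map(_.toLowerCase) match {
        case None | Some("close") => None
        case Some("green")        => Some(OpenAccessRoute.green)
        case Some("bronze")       => Some(OpenAccessRoute.bronze)
        case Some("hybrid")       => Some(OpenAccessRoute.hybrid)
        case Some(_)              => Some(OpenAccessRoute.gold)
      }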
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Generate DOIBoost ActionSet for BETA - PREPROCESS" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
|
|
|
@@ -1,4 +1,4 @@
-<workflow-app name="Generate DOIBoost ActionSet for PROD" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Generate DOIBoost ActionSet for BETA - PROCESS" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>sparkDriverMemory</name>

@@ -100,7 +100,7 @@
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=7680
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@@ -125,7 +125,7 @@
                 --executor-memory=${sparkExecutorIntersectionMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=7680
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -492,6 +492,124 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testLicenseVorClosed() :Unit = {
|
||||||
|
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_vor.json")).mkString
|
||||||
|
|
||||||
|
|
||||||
|
assertNotNull(json)
|
||||||
|
assertFalse(json.isEmpty);
|
||||||
|
|
||||||
|
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||||
|
|
||||||
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
|
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testLicenseOpen() :Unit = {
|
||||||
|
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_open.json")).mkString
|
||||||
|
|
||||||
|
|
||||||
|
assertNotNull(json)
|
||||||
|
assertFalse(json.isEmpty);
|
||||||
|
|
||||||
|
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||||
|
|
||||||
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
|
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
|
||||||
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testLicenseEmbargoOpen() :Unit = {
|
||||||
|
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_open.json")).mkString
|
||||||
|
|
||||||
|
|
||||||
|
assertNotNull(json)
|
||||||
|
assertFalse(json.isEmpty);
|
||||||
|
|
||||||
|
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||||
|
|
||||||
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
|
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
|
||||||
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testLicenseEmbargo() :Unit = {
|
||||||
|
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo.json")).mkString
|
||||||
|
|
||||||
|
|
||||||
|
assertNotNull(json)
|
||||||
|
assertFalse(json.isEmpty);
|
||||||
|
|
||||||
|
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||||
|
|
||||||
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
|
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||||
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testLicenseEmbargoDateTime() :Unit = {
|
||||||
|
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_datetime.json")).mkString
|
||||||
|
|
||||||
|
|
||||||
|
assertNotNull(json)
|
||||||
|
assertFalse(json.isEmpty);
|
||||||
|
|
||||||
|
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||||
|
|
||||||
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
|
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
|
||||||
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||||
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -71,6 +71,8 @@ public abstract class AbstractMdRecordToOafMapper {

    private final boolean shouldHashId;

+   private final boolean forceOriginalId;
+
    protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
    protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
    protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";

@@ -98,11 +100,20 @@ public abstract class AbstractMdRecordToOafMapper {
        nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
    }

+   protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
+       final boolean shouldHashId, final boolean forceOriginalId) {
+       this.vocs = vocs;
+       this.invisible = invisible;
+       this.shouldHashId = shouldHashId;
+       this.forceOriginalId = forceOriginalId;
+   }
+
    protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
        final boolean shouldHashId) {
        this.vocs = vocs;
        this.invisible = invisible;
        this.shouldHashId = shouldHashId;
+       this.forceOriginalId = false;
    }

    public List<Oaf> processMdRecord(final String xml) {

@@ -190,11 +201,17 @@ public abstract class AbstractMdRecordToOafMapper {
        final long lastUpdateTimestamp) {

        final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);

+       final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
+       originalId.add(entity.getId());
+       entity.setOriginalId(Lists.newArrayList(originalId));
+
+       if (!forceOriginalId) {
            final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
            if (!id.equals(entity.getId())) {
-               entity.getOriginalId().add(entity.getId());
                entity.setId(id);
            }
+       }

        final List<Oaf> oafs = Lists.newArrayList(entity);
@@ -163,11 +163,13 @@ public class GenerateEntitiesApplication {

        switch (type.toLowerCase()) {
            case "oaf-store-cleaned":
-           case "oaf-store-claim":
                return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
+           case "oaf-store-claim":
+               return new OafToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
            case "odf-store-cleaned":
-           case "odf-store-claim":
                return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
+           case "odf-store-claim":
+               return new OdfToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
            case "oaf-store-intersection":
                return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
            case "odf-store-intersection":
@@ -27,6 +27,11 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;

public class OafToOafMapper extends AbstractMdRecordToOafMapper {

+   public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
+       final boolean forceOrginalId) {
+       super(vocs, invisible, shouldHashId, forceOrginalId);
+   }
+
    public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
        super(vocs, invisible, shouldHashId);
    }
@@ -22,6 +22,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {

    public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";

+   public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
+       final boolean forceOrginalId) {
+       super(vocs, invisible, shouldHashId, forceOrginalId);
+   }
+
    public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
        super(vocs, invisible, shouldHashId);
    }
@@ -0,0 +1,127 @@ (new file)

package eu.dnetlib.dhp.oa.graph.raw;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.FileNotFoundException;
import java.util.Objects;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.raw.common.RelationIdMapping;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class PatchRelationsApplication {

    private static final Logger log = LoggerFactory.getLogger(PatchRelationsApplication.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    Optional
                        .ofNullable(
                            PatchRelationsApplication.class
                                .getResourceAsStream(
                                    "/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json"))
                        .orElseThrow(FileNotFoundException::new)));
        parser.parseArgument(args);

        final Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String graphBasePath = parser.get("graphBasePath");
        log.info("graphBasePath: {}", graphBasePath);

        final String workingDir = parser.get("workingDir");
        log.info("workingDir: {}", workingDir);

        final String idMappingPath = parser.get("idMappingPath");
        log.info("idMappingPath: {}", idMappingPath);

        final SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> patchRelations(spark, graphBasePath, workingDir, idMappingPath));
    }

    /**
     * Substitutes the identifiers (source/target) from the set of relations part of the graphBasePath included in the
     * mapping provided by the dataset stored on idMappingPath, using workingDir as intermediate storage location.
     *
     * @param spark the SparkSession
     * @param graphBasePath base graph path providing the set of relations to patch
     * @param workingDir intermediate storage location
     * @param idMappingPath dataset providing the old -> new identifier mapping
     */
    private static void patchRelations(final SparkSession spark, final String graphBasePath, final String workingDir,
        final String idMappingPath) {

        final String relationPath = graphBasePath + "/relation";

        final Dataset<Relation> rels = Utils.readPath(spark, relationPath, Relation.class);
        final Dataset<RelationIdMapping> idMapping = Utils.readPath(spark, idMappingPath, RelationIdMapping.class);

        log.info("relations: {}", rels.count());
        log.info("idMapping: {}", idMapping.count());

        final Dataset<Relation> bySource = rels
            .joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
            .map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
                final Relation r = t._1();
                Optional
                    .ofNullable(t._2())
                    .map(RelationIdMapping::getNewId)
                    .ifPresent(r::setSource);
                return r;
            }, Encoders.bean(Relation.class));

        bySource
            .joinWith(idMapping, bySource.col("target").equalTo(idMapping.col("oldId")), "left")
            .map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
                final Relation r = t._1();
                Optional
                    .ofNullable(t._2())
                    .map(RelationIdMapping::getNewId)
                    .ifPresent(r::setTarget);
                return r;
            }, Encoders.bean(Relation.class))
            .map(
                (MapFunction<Relation, String>) OBJECT_MAPPER::writeValueAsString,
                Encoders.STRING())
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .text(workingDir);

        spark
            .read()
            .textFile(workingDir)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .text(relationPath);
    }

}
@@ -0,0 +1,25 @@ (new file)

package eu.dnetlib.dhp.oa.graph.raw.common;

public class RelationIdMapping {

    private String oldId;

    private String newId;

    public String getOldId() {
        return oldId;
    }

    public void setOldId(final String oldId) {
        this.oldId = oldId;
    }

    public String getNewId() {
        return newId;
    }

    public void setNewId(final String newId) {
        this.newId = newId;
    }
}
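For illustration only (not part of the commit): a minimal sketch of the patching semantics that PatchRelationsApplication implements with Spark left joins, reduced to plain Scala collections. Rel is a stand-in for the OAF Relation bean, and the oldId to newId pairs mirror the RelationIdMapping records read from idMappingPath.

// Sketch: apply an oldId -> newId mapping to relation endpoints, keeping ids that are unmapped.
object PatchRelationsSketch {
  final case class Rel(source: String, target: String)

  def patch(rels: Seq[Rel], idMapping: Map[String, String]): Seq[Rel] =
    rels.map { r =>
      Rel(
        source = idMapping.getOrElse(r.source, r.source), // like the "left" join: unmapped ids pass through
        target = idMapping.getOrElse(r.target, r.target))
    }

  def main(args: Array[String]): Unit = {
    val mapping = Map("old::1" -> "new::1")
    println(patch(Seq(Rel("old::1", "old::2")), mapping)) // List(Rel(new::1,old::2))
  }
}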
@@ -0,0 +1,42 @@ (new file)

package eu.dnetlib.dhp.sx.graph

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

object SparkConvertDatasetToJsonRDD {

  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath -> $sourcePath")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")

    val resultObject = List("publication","dataset","software", "otherResearchProduct")
    val mapper = new ObjectMapper()
    implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])

    resultObject.foreach{item =>
      spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
    }
  }
}
@@ -0,0 +1,67 @@ (new file)

package eu.dnetlib.dhp.sx.graph

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

object SparkConvertRDDtoDataset {

  def main(args: Array[String]): Unit = {

    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath -> $sourcePath")
    val t = parser.get("targetPath")
    log.info(s"targetPath -> $t")

    val entityPath = s"$t/entities"
    val relPath = s"$t/relation"
    val mapper = new ObjectMapper()
    implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
    implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
    implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
    implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
    implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])

    log.info("Converting dataset")
    val rddDataset = spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset]))
    spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")

    log.info("Converting publication")
    val rddPublication = spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication]))
    spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")

    log.info("Converting software")
    val rddSoftware = spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software]))
    spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")

    log.info("Converting otherresearchproduct")
    val rddOtherResearchProduct = spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
    spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")

    log.info("Converting Relation")

    val rddRelation = spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation]))
    spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")

  }
}
@@ -70,7 +70,7 @@ object SparkCreateInputGraph {

    resultObject.foreach { r =>
      log.info(s"Make ${r._1} unique")
-     makeDatasetUnique(s"$targetPath/extracted/${r._1}",s"$targetPath/dedup/${r._1}",spark, r._2)
+     makeDatasetUnique(s"$targetPath/extracted/${r._1}",s"$targetPath/preprocess/${r._1}",spark, r._2)
    }
  }
@@ -42,6 +42,7 @@ object SparkCreateScholix {

    val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
+     .filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))

    val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
@@ -1,7 +1,7 @@
 package eu.dnetlib.dhp.sx.graph

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.Result
+import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
 import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
 import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
 import org.apache.commons.io.IOUtils

@@ -29,11 +29,12 @@ object SparkCreateSummaryObject {
    log.info(s"targetPath -> $targetPath")

    implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
+   implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]

    implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]

-   val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result]
+   val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r=>r.getDataInfo== null || r.getDataInfo.getDeletedbyinference== false)

    ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath)
@@ -1,10 +1,17 @@
 package eu.dnetlib.dhp.sx.graph

+import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
 import org.apache.commons.io.IOUtils
+import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.SparkConf
+import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
+import org.json4s
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.{JField, JObject, JString}
+import org.json4s.jackson.JsonMethods.parse
 import org.slf4j.{Logger, LoggerFactory}

 import scala.collection.JavaConverters._

@@ -25,16 +32,97 @@ object SparkResolveRelation {
    val relationPath = parser.get("relationPath")
    log.info(s"sourcePath -> $relationPath")
    val entityPath = parser.get("entityPath")
-   log.info(s"targetPath -> $entityPath")
+   log.info(s"entityPath -> $entityPath")
    val workingPath = parser.get("workingPath")
    log.info(s"workingPath -> $workingPath")

-   implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
    import spark.implicits._
-   val entities:Dataset[Result] = spark.read.load(s"$entityPath/*").as[Result]
+
+   extractPidResolvedTableFromJsonRDD(spark, entityPath, workingPath)
+
+   val mappper = new ObjectMapper()
+
+   val rPid:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String,String)]
+
+   val relationDs:Dataset[(String,Relation)] = spark.read.load(relationPath).as[Relation].map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
+
+   relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map{
+     m =>
+       val sourceResolved = m._2
+       val currentRelation = m._1._2
+       if (sourceResolved!=null && sourceResolved._1!=null && sourceResolved._1.nonEmpty)
+         currentRelation.setSource(sourceResolved._1)
+       currentRelation
+   }.write
+     .mode(SaveMode.Overwrite)
+     .save(s"$workingPath/relationResolvedSource")
+
+   val relationSourceResolved:Dataset[(String,Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation].map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
+   relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map{
+     m =>
+       val targetResolved = m._2
+       val currentRelation = m._1._2
+       if (targetResolved!=null && targetResolved._1.nonEmpty)
+         currentRelation.setTarget(targetResolved._1)
+       currentRelation
+   }.filter(r => r.getSource.startsWith("50")&& r.getTarget.startsWith("50"))
+     .write
+     .mode(SaveMode.Overwrite)
+     .save(s"$workingPath/relation_resolved")
+
+   spark.read.load(s"$workingPath/relation_resolved").as[Relation]
+     .map(r => mappper.writeValueAsString(r))
+     .rdd.saveAsTextFile(s"$workingPath/relation", classOf[GzipCodec])
+
+ }
+
+ private def extractPidsFromRecord(input:String):(String,List[(String,String)]) = {
+   implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+   lazy val json: json4s.JValue = parse(input)
+   val id:String = (json \ "id").extract[String]
+   val result: List[(String,String)] = for {
+     JObject(pids) <- json \ "pid"
+     JField("value", JString(pidValue)) <- pids
+     JField("qualifier", JObject(qualifier)) <- pids
+     JField("classname", JString(pidType)) <- qualifier
+   } yield (pidValue, pidType)
+   (id,result)
+ }
+
+ private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, entityPath: String, workingPath: String) = {
+   import spark.implicits._
+
+   val d: RDD[(String,String)] = spark.sparkContext.textFile(s"$entityPath/*")
+     .map(i => extractPidsFromRecord(i))
+     .filter(s => s != null && s._1!= null && s._2!=null && s._2.nonEmpty)
+     .flatMap{ p =>
+       p._2.map(pid =>
+         (p._1, convertPidToDNETIdentifier(pid._1, pid._2))
+       )
+     }.filter(r =>r._1 != null || r._2 != null)
+
+   spark.createDataset(d)
+     .groupByKey(_._2)
+     .reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
+     .map(s => s._2)
+     .write
+     .mode(SaveMode.Overwrite)
+     .save(s"$workingPath/relationResolvedPid")
+ }
+
+ /*
+   This method should be used once we finally convert everything in Kryo dataset
+   instead of using rdd of json
+  */
+ private def extractPidResolvedTableFromKryo(spark: SparkSession, entityPath: String, workingPath: String) = {
+   import spark.implicits._
+   implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
+   val entities: Dataset[Result] = spark.read.load(s"$entityPath/*").as[Result]
    entities.flatMap(e => e.getPid.asScala
      .map(p =>
        convertPidToDNETIdentifier(p.getValue, p.getQualifier.getClassid))

@@ -45,41 +133,9 @@ object SparkResolveRelation {
      .map(s => s._2)
      .write
      .mode(SaveMode.Overwrite)
-     .save(s"$workingPath/resolvedPid")
+     .save(s"$workingPath/relationResolvedPid")
-
-   val rPid:Dataset[(String,String)] = spark.read.load(s"$workingPath/resolvedPid").as[(String,String)]
-
-   val relationDs:Dataset[(String,Relation)] = spark.read.load(relationPath).as[Relation].map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
-
-   relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_1")), "left").map{
-     m =>
-       val sourceResolved = m._2
-       val currentRelation = m._1._2
-       if (sourceResolved!=null && sourceResolved._2.nonEmpty)
-         currentRelation.setSource(sourceResolved._2)
-       currentRelation
-   }.write
-     .mode(SaveMode.Overwrite)
-     .save(s"$workingPath/resolvedSource")
-
-   val relationSourceResolved:Dataset[(String,Relation)] = spark.read.load(s"$workingPath/resolvedSource").as[Relation].map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
-   relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_1")), "left").map{
-     m =>
-       val targetResolved = m._2
-       val currentRelation = m._1._2
-       if (targetResolved!=null && targetResolved._2.nonEmpty)
-         currentRelation.setTarget(targetResolved._2)
-       currentRelation
-   }.filter(r => r.getSource.startsWith("50")&& r.getTarget.startsWith("50"))
-     .write
-     .mode(SaveMode.Overwrite)
-     .save(s"$workingPath/resolvedRelation")
  }

  def convertPidToDNETIdentifier(pid:String, pidType: String):String = {
    if (pid==null || pid.isEmpty || pidType== null || pidType.isEmpty)
      null
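For illustration only (not part of the commit): a self-contained sketch of the json4s pid-extraction pattern used by extractPidsFromRecord above, run against a toy record. The identifier and DOI values below are made up for the example.

// Sketch: extract (pidValue, pidType) pairs from an entity JSON record, json4s style.
import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object PidExtractionSketch {
  implicit lazy val formats: DefaultFormats.type = DefaultFormats

  def main(args: Array[String]): Unit = {
    val record = """{"id":"50|toy::1","pid":[{"value":"10.1000/xyz","qualifier":{"classname":"doi"}}]}"""
    val json = parse(record)
    val id = (json \ "id").extract[String]
    // for-comprehension over the JSON AST, mirroring the diff above
    val pids = for {
      JObject(pid)                      <- json \ "pid"
      JField("value", JString(value))   <- pid
      JField("qualifier", JObject(q))   <- pid
      JField("classname", JString(tpe)) <- q
    } yield (value, tpe)
    println((id, pids)) // (50|toy::1,List((10.1000/xyz,doi)))
  }
}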
@@ -199,7 +199,7 @@ object BioDBToOAF {
      d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
    }
    val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
-     .map(date => OafMapperUtils.structuredProperty(date.date, "UNKNOWN", "UNKNOWN", ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
+     .map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
    if (relevant_dates != null && relevant_dates.nonEmpty)
      d.setRelevantdate(relevant_dates.asJava)
    d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))

@@ -218,12 +218,12 @@ object BioDBToOAF {

    if (references_pmid != null && references_pmid.nonEmpty) {
-     val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), "relationship", "isRelatedTo", if (i_date.isDefined) i_date.get.date else null)
+     val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
      rel.getCollectedfrom
      List(d, rel)
    }
    else if (references_doi != null && references_doi.nonEmpty) {
-     val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), "relationship", "isRelatedTo", if (i_date.isDefined) i_date.get.date else null)
+     val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
      List(d, rel)
    }
    else

@@ -243,7 +243,7 @@ object BioDBToOAF {
    rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
    rel.setDataInfo(DATA_INFO)

-   rel.setRelType("resultResult")
+   rel.setRelType(ModelConstants.RESULT_RESULT)
    rel.setSubRelType(subRelType)
    rel.setRelClass(relClass)

@@ -263,7 +263,7 @@ object BioDBToOAF {

  def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date:String): Relation = {
-   createRelation(pid,pidType,sourceId,collectedFrom, "supplement","IsSupplementTo", date)
+   createRelation(pid,pidType,sourceId,collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
  }

@@ -392,6 +392,6 @@ object BioDBToOAF {
    i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
    d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))

-   List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"),"relationship", "isRelatedTo", GraphCleaningFunctions.cleanDate(input.date)))
+   List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
  }
}
@@ -16,7 +16,7 @@ object PubMedToOaf {
  )

  def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
-   val result_typologies = getVocabularyTerm("dnet:result_typologies", vocabularies, cobjQualifier.getClassid)
+   val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
    result_typologies.getClassid match {
      case "dataset" => new Dataset
      case "publication" => new Publication

@@ -68,11 +68,11 @@ object PubMedToOaf {
    //else We have to find a terms that match the vocabulary otherwise we discard it
    val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
    if (ja.isDefined) {
-     val cojbCategory = getVocabularyTerm("dnet:publication_resource", vocabularies, ja.get.getValue)
+     val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
      i.setInstancetype(cojbCategory)
    } else {
      val i_type = article.getPublicationTypes.asScala
-       .map(s => getVocabularyTerm("dnet:publication_resource", vocabularies, s.getValue))
+       .map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
        .find(q => q != null)
      if (i_type.isDefined)
        i.setInstancetype(i_type.get)

@@ -112,7 +112,7 @@ object PubMedToOaf {

    if (article.getLanguage != null) {

-     val term = vocabularies.getSynonymAsQualifier("dnet:languages", article.getLanguage)
+     val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
      if (term != null)
        result.setLanguage(term)
    }
@@ -0,0 +1,26 @@ (new file)

[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "g",
    "paramLongName": "graphBasePath",
    "paramDescription": "base graph path providing the set of relations to patch",
    "paramRequired": true
  },
  {
    "paramName": "w",
    "paramLongName": "workingDir",
    "paramDescription": "intermediate storage location",
    "paramRequired": true
  },
  {
    "paramName": "i",
    "paramLongName": "idMappingPath",
    "paramDescription": "dataset providing the old -> new identifier mapping",
    "paramRequired": true
  }
]
@@ -100,6 +100,16 @@
            <value></value>
            <description>a blacklist of nsprefixes (comma separeted)</description>
        </property>
+       <property>
+           <name>shouldPatchRelations</name>
+           <value>false</value>
+           <description>activates the relation patching phase, driven by the content in ${idMappingPath}</description>
+       </property>
+       <property>
+           <name>idMappingPath</name>
+           <value></value>
+           <description>path pointing to the relations identifiers mapping dataset</description>
+       </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>

@@ -551,7 +561,6 @@
        <path start="merge_claims_relation"/>
    </fork>

-
    <action name="merge_claims_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>

@@ -760,7 +769,42 @@
        <error to="Kill"/>
    </action>

-   <join name="wait_merge" to="End"/>
+   <join name="wait_merge" to="decisionPatchRelations"/>
+
+   <decision name="decisionPatchRelations">
+       <switch>
+           <case to="patchRelations">
+               ${(shouldPatchRelations eq "true") and
+               (fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")}
+           </case>
+           <default to="End"/>
+       </switch>
+   </decision>
+
+   <action name="patchRelations">
+       <spark xmlns="uri:oozie:spark-action:0.2">
+           <master>yarn</master>
+           <mode>cluster</mode>
+           <name>PatchRelations</name>
+           <class>eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication</class>
+           <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+           <spark-opts>
+               --executor-memory ${sparkExecutorMemory}
+               --executor-cores ${sparkExecutorCores}
+               --driver-memory=${sparkDriverMemory}
+               --conf spark.extraListeners=${spark2ExtraListeners}
+               --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+               --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+               --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+               --conf spark.sql.shuffle.partitions=7680
+           </spark-opts>
+           <arg>--graphBasePath</arg><arg>${graphOutputPath}</arg>
+           <arg>--workingDir</arg><arg>${workingDir}/patch_relations</arg>
+           <arg>--idMappingPath</arg><arg>${idMappingPath}</arg>
+       </spark>
+       <ok to="End"/>
+       <error to="Kill"/>
+   </action>

    <end name="End"/>
</workflow-app>
@@ -0,0 +1,5 @@ (new file)

[
  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
  {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}
]
@@ -0,0 +1,85 @@ (new file)

<workflow-app name="Create Raw Graph Step 1: extract Entities in raw graph" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the working dir base path</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the graph Raw base path</description>
        </property>
    </parameters>

    <start to="ExtractEntities"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="ExtractEntities">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Extract entities in raw graph</name>
            <class>eu.dnetlib.dhp.sx.graph.SparkCreateInputGraph</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=2000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
        </spark>
        <ok to="DropDedupPath"/>
        <error to="Kill"/>
    </action>

    <action name="DropDedupPath">
        <fs>
            <delete path='${targetPath}/dedup'/>
            <mkdir path='${targetPath}/dedup/'/>
        </fs>
        <ok to="GenerateInputGraphForDedup"/>
        <error to="Kill"/>
    </action>

    <action name="GenerateInputGraphForDedup">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Generate Input Graph for deduplication</name>
            <class>eu.dnetlib.dhp.sx.graph.SparkConvertDatasetToJsonRDD</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=3000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--sourcePath</arg><arg>${targetPath}/preprocess</arg>
            <arg>--targetPath</arg><arg>${targetPath}/dedup</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
@@ -1,4 +1,4 @@
-<workflow-app name="Create Raw Graph Step 1: extract Entities in raw graph" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Create Scholix final Graph" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>

@@ -6,48 +6,22 @@
        </property>
        <property>
            <name>targetPath</name>
-           <description>the graph Raw base path</description>
+           <description>the final graph path</description>
        </property>
    </parameters>

-   <start to="ExtractEntities"/>
+   <start to="ImportDatasetEntities"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

-   <action name="ExtractEntities">
+   <action name="ImportDatasetEntities">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-           <name>Extract entities in raw graph</name>
-           <class>eu.dnetlib.dhp.sx.graph.SparkCreateInputGraph</class>
+           <name>Import JSONRDD to Dataset kryo</name>
+           <class>eu.dnetlib.dhp.sx.graph.SparkConvertRDDtoDataset</class>
-           <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-           <spark-opts>
-               --executor-memory=${sparkExecutorMemory}
-               --executor-cores=${sparkExecutorCores}
-               --driver-memory=${sparkDriverMemory}
-               --conf spark.extraListeners=${spark2ExtraListeners}
-               --conf spark.sql.shuffle.partitions=2000
-               --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-               --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-               --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-           </spark-opts>
-           <arg>--master</arg><arg>yarn</arg>
-           <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-           <arg>--targetPath</arg><arg>${targetPath}</arg>
-       </spark>
-       <ok to="ResolveRelations"/>
-       <error to="Kill"/>
-   </action>
-
-   <action name="ResolveRelations">
-       <spark xmlns="uri:oozie:spark-action:0.2">
-           <master>yarn</master>
-           <mode>cluster</mode>
-           <name>Resolve Relations in raw graph</name>
-           <class>eu.dnetlib.dhp.sx.graph.SparkResolveRelation</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}

@@ -60,9 +34,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--master</arg><arg>yarn</arg>
-           <arg>--relationPath</arg><arg>${targetPath}/extracted/relation</arg>
-           <arg>--workingPath</arg><arg>${targetPath}/resolved/</arg>
-           <arg>--entityPath</arg><arg>${targetPath}/dedup</arg>
+           <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+           <arg>--targetPath</arg><arg>${targetPath}</arg>
        </spark>
        <ok to="CreateSummaries"/>
        <error to="Kill"/>

@@ -87,7 +60,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--master</arg><arg>yarn</arg>
-           <arg>--sourcePath</arg><arg>${targetPath}/dedup</arg>
+           <arg>--sourcePath</arg><arg>${targetPath}/entities</arg>
            <arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
        </spark>
        <ok to="CreateScholix"/>

@@ -114,7 +87,7 @@
            <arg>--master</arg><arg>yarn</arg>
            <arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
            <arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
-           <arg>--relationPath</arg><arg>${targetPath}/resolved/resolvedRelation</arg>
+           <arg>--relationPath</arg><arg>${targetPath}/relation</arg>

        </spark>
        <ok to="DropJSONPath"/>

@@ -182,9 +155,5 @@
        <ok to="End"/>
        <error to="Kill"/>
    </action>
-
-
-
-
    <end name="End"/>
</workflow-app>
@ -0,0 +1,62 @@
|
||||||
|
<workflow-app name="Resolve Relation" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>entityPath</name>
|
||||||
|
<description>the path of deduplicate Entities</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>relationPath</name>
|
||||||
|
<description>the path of relation unresolved</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>targetPath</name>
|
||||||
|
<description>the path of relation unresolved</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="DropRelFolder"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="DropRelFolder">
|
||||||
|
<fs>
|
||||||
|
<delete path='${targetPath}/relation'/>
|
||||||
|
<delete path='${targetPath}/relation_resolved'/>
|
||||||
|
<delete path='${targetPath}/resolvedSource'/>
|
||||||
|
<delete path='${targetPath}/resolvedPid'/>
|
||||||
|
|
||||||
|
</fs>
|
||||||
|
<ok to="ResolveRelations"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
<action name="ResolveRelations">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Resolve Relations in raw graph</name>
|
||||||
|
<class>eu.dnetlib.dhp.sx.graph.SparkResolveRelation</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.shuffle.partitions=3000
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
|
<arg>--relationPath</arg><arg>${relationPath}</arg>
|
||||||
|
<arg>--workingPath</arg><arg>${targetPath}</arg>
|
||||||
|
<arg>--entityPath</arg><arg>${entityPath}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -1,120 +0,0 @@
|
||||||
<workflow-app name="Create Raw Graph Step 2: Map XML to OAF Entities" xmlns="uri:oozie:workflow:0.5">
|
|
||||||
<parameters>
|
|
||||||
<property>
|
|
||||||
<name>workingPath</name>
|
|
||||||
<description>the working path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkDriverMemory</name>
|
|
||||||
<description>memory for driver process</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorMemory</name>
|
|
||||||
<description>memory for individual executor</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
|
||||||
|
|
||||||
<start to="ExtractDLIPublication"/>
|
|
||||||
|
|
||||||
<kill name="Kill">
|
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
|
||||||
</kill>
|
|
||||||
|
|
||||||
<action name="ExtractDLIPublication">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Extract DLI Entities (Publication)</name>
|
|
||||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-memory ${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.sql.shuffle.partitions=5000
|
|
||||||
${sparkExtraOPT}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
|
||||||
<arg>-e</arg><arg>publication</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="ExtractDLIDataset"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="ExtractDLIDataset">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Extract DLI Entities (Dataset)</name>
|
|
||||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-memory ${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.sql.shuffle.partitions=5000
|
|
||||||
${sparkExtraOPT}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
|
||||||
<arg>-e</arg><arg>dataset</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="ExtractDLIUnknown"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="ExtractDLIUnknown">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Extract DLI Entities (Unknown)</name>
|
|
||||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-memory ${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.sql.shuffle.partitions=5000
|
|
||||||
${sparkExtraOPT}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
|
||||||
<arg>-e</arg><arg>unknown</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="ExtractDLIRelation"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="ExtractDLIRelation">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Extract DLI Entities (Relation)</name>
|
|
||||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-memory ${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.sql.shuffle.partitions=5000
|
|
||||||
${sparkExtraOPT}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
|
||||||
<arg>-e</arg><arg>relation</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<end name="End"/>
|
|
||||||
</workflow-app>
|
|
|
@ -1,61 +0,0 @@
|
||||||
<workflow-app name="Create Raw Graph Final Step: Construct the Scholexplorer Raw Graph" xmlns="uri:oozie:workflow:0.5">
|
|
||||||
<parameters>
|
|
||||||
<property>
|
|
||||||
<name>sourcePath</name>
|
|
||||||
<description>the source path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>targetPath</name>
|
|
||||||
<description>the source path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkDriverMemory</name>
|
|
||||||
<description>memory for driver process</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorMemory</name>
|
|
||||||
<description>memory for individual executor</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>entity</name>
|
|
||||||
<description>the entity to be merged</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
|
||||||
|
|
||||||
<start to="DeleteTargetPath"/>
|
|
||||||
|
|
||||||
<kill name="Kill">
|
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
|
||||||
</kill>
|
|
||||||
|
|
||||||
<action name="DeleteTargetPath">
|
|
||||||
<fs>
|
|
||||||
<mkdir path="${targetPath}"/>
|
|
||||||
|
|
||||||
<delete path='${targetPath}/${entity}'/>
|
|
||||||
</fs>
|
|
||||||
<ok to="MergeDLIEntities"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="MergeDLIEntities">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Merge ${entity}</name>
|
|
||||||
<class>eu.dnetlib.dhp.sx.graph.SparkScholexplorerCreateRawGraphJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts> --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
|
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/${entity}</arg>
|
|
||||||
<arg>--targetPath</arg><arg>${targetPath}/${entity}</arg>
|
|
||||||
<arg>--entity</arg><arg>${entity}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<end name="End"/>
|
|
||||||
</workflow-app>
|
|
|
@ -1,11 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup;
|
||||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
import static org.mockito.Mockito.lenient;
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -25,15 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest;
|
import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
@ -74,7 +64,7 @@ public class MappersTest {
|
||||||
|
|
||||||
assertValidId(p.getId());
|
assertValidId(p.getId());
|
||||||
|
|
||||||
assertEquals(1, p.getOriginalId().size());
|
assertEquals(2, p.getOriginalId().size());
|
||||||
assertTrue(p.getOriginalId().contains("10.3897/oneeco.2.e13718"));
|
assertTrue(p.getOriginalId().contains("10.3897/oneeco.2.e13718"));
|
||||||
|
|
||||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
|
@ -261,8 +251,8 @@ public class MappersTest {
|
||||||
final Relation r2 = (Relation) list.get(2);
|
final Relation r2 = (Relation) list.get(2);
|
||||||
|
|
||||||
assertValidId(d.getId());
|
assertValidId(d.getId());
|
||||||
assertEquals(1, d.getOriginalId().size());
|
assertEquals(2, d.getOriginalId().size());
|
||||||
assertTrue(d.getOriginalId().contains("oai:zenodo.org:3234526"));
|
assertTrue(d.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:zenodo.org:3234526")));
|
||||||
assertValidId(d.getCollectedfrom().get(0).getKey());
|
assertValidId(d.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
||||||
assertTrue(d.getAuthor().size() > 0);
|
assertTrue(d.getAuthor().size() > 0);
|
||||||
|
@ -351,8 +341,11 @@ public class MappersTest {
|
||||||
final Publication p = (Publication) list.get(0);
|
final Publication p = (Publication) list.get(0);
|
||||||
|
|
||||||
assertValidId(p.getId());
|
assertValidId(p.getId());
|
||||||
assertTrue(p.getOriginalId().size() == 1);
|
assertEquals(2, p.getOriginalId().size());
|
||||||
assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
|
|
||||||
|
assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:pub.uni-bielefeld.de:2949739")));
|
||||||
|
// assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
|
||||||
|
|
||||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(p.getAuthor().size() > 0);
|
assertTrue(p.getAuthor().size() > 0);
|
||||||
|
|
||||||
|
@ -413,7 +406,8 @@ public class MappersTest {
|
||||||
assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, d.getDataInfo().getProvenanceaction().getSchemename());
|
assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, d.getDataInfo().getProvenanceaction().getSchemename());
|
||||||
|
|
||||||
assertValidId(d.getId());
|
assertValidId(d.getId());
|
||||||
assertTrue(d.getOriginalId().size() == 1);
|
assertEquals(2, d.getOriginalId().size());
|
||||||
|
|
||||||
assertEquals("feabb67c-1fd1-423b-aec6-606d04ce53c6", d.getOriginalId().get(0));
|
assertEquals("feabb67c-1fd1-423b-aec6-606d04ce53c6", d.getOriginalId().get(0));
|
||||||
assertValidId(d.getCollectedfrom().get(0).getKey());
|
assertValidId(d.getCollectedfrom().get(0).getKey());
|
||||||
|
|
||||||
|
@ -567,6 +561,31 @@ public class MappersTest {
|
||||||
assertNotNull(d.getInstance().get(0).getUrl());
|
assertNotNull(d.getInstance().get(0).getUrl());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testEnermaps() throws IOException {
|
||||||
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("enermaps.xml"));
|
||||||
|
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
|
|
||||||
|
System.out.println("***************");
|
||||||
|
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||||
|
System.out.println("***************");
|
||||||
|
|
||||||
|
assertEquals(1, list.size());
|
||||||
|
assertTrue(list.get(0) instanceof Dataset);
|
||||||
|
|
||||||
|
final Dataset d = (Dataset) list.get(0);
|
||||||
|
|
||||||
|
assertValidId(d.getId());
|
||||||
|
assertValidId(d.getCollectedfrom().get(0).getKey());
|
||||||
|
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
||||||
|
assertEquals(1, d.getAuthor().size());
|
||||||
|
assertEquals(1, d.getInstance().size());
|
||||||
|
assertNotNull(d.getInstance().get(0).getUrl());
|
||||||
|
assertNotNull(d.getContext());
|
||||||
|
assertTrue(StringUtils.isNotBlank(d.getContext().get(0).getId()));
|
||||||
|
assertEquals("enermaps::selection::tgs00004", d.getContext().get(0).getId());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testClaimFromCrossref() throws IOException {
|
void testClaimFromCrossref() throws IOException {
|
||||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
|
||||||
|
@ -648,6 +667,30 @@ public class MappersTest {
|
||||||
System.out.println(p.getTitle().get(0).getValue());
|
System.out.println(p.getTitle().get(0).getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testJairo() throws IOException {
|
||||||
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml"));
|
||||||
|
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
|
|
||||||
|
System.out.println("***************");
|
||||||
|
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||||
|
System.out.println("***************");
|
||||||
|
|
||||||
|
final Publication p = (Publication) list.get(0);
|
||||||
|
assertValidId(p.getId());
|
||||||
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
|
|
||||||
|
assertNotNull(p.getTitle());
|
||||||
|
assertFalse(p.getTitle().isEmpty());
|
||||||
|
assertTrue(p.getTitle().size() == 1);
|
||||||
|
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
|
|
||||||
|
final Publication p_cleaned = cleanup(fixVocabularyNames(p));
|
||||||
|
|
||||||
|
assertNotNull(p_cleaned.getTitle());
|
||||||
|
assertFalse(p_cleaned.getTitle().isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testOdfFromHdfs() throws IOException {
|
void testOdfFromHdfs() throws IOException {
|
||||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
|
||||||
|
@ -663,8 +706,8 @@ public class MappersTest {
|
||||||
final Dataset p = (Dataset) list.get(0);
|
final Dataset p = (Dataset) list.get(0);
|
||||||
|
|
||||||
assertValidId(p.getId());
|
assertValidId(p.getId());
|
||||||
assertTrue(p.getOriginalId().size() == 1);
|
assertEquals(2, p.getOriginalId().size());
|
||||||
assertEquals("df76e73f-0483-49a4-a9bb-63f2f985574a", p.getOriginalId().get(0));
|
assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("df76e73f-0483-49a4-a9bb-63f2f985574a")));
|
||||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(p.getAuthor().size() > 0);
|
assertTrue(p.getAuthor().size() > 0);
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,115 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.junit.jupiter.api.AfterAll;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
|
public class PatchRelationApplicationTest {
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
public static final String ID_MAPPING_PATH = "map/id_mapping.json";
|
||||||
|
|
||||||
|
private static SparkSession spark;
|
||||||
|
|
||||||
|
private static Path workingDir;
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(PatchRelationApplicationTest.class);
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void beforeAll() throws IOException {
|
||||||
|
workingDir = Files
|
||||||
|
.createTempDirectory(PatchRelationApplicationTest.class.getSimpleName());
|
||||||
|
log.info("using work dir {}", workingDir);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(PatchRelationApplicationTest.class.getSimpleName());
|
||||||
|
|
||||||
|
conf.setMaster("local[*]");
|
||||||
|
conf.set("spark.driver.host", "localhost");
|
||||||
|
conf.set("hive.metastore.local", "true");
|
||||||
|
conf.set("spark.ui.enabled", "false");
|
||||||
|
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||||
|
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||||
|
|
||||||
|
spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.appName(PatchRelationApplicationTest.class.getSimpleName())
|
||||||
|
.config(conf)
|
||||||
|
.getOrCreate();
|
||||||
|
|
||||||
|
FileUtils
|
||||||
|
.copyInputStreamToFile(
|
||||||
|
PatchRelationApplicationTest.class.getResourceAsStream("id_mapping.json"),
|
||||||
|
workingDir.resolve(ID_MAPPING_PATH).toFile());
|
||||||
|
|
||||||
|
FileUtils
|
||||||
|
.copyInputStreamToFile(
|
||||||
|
PatchRelationApplicationTest.class.getResourceAsStream("relations_to_patch.json"),
|
||||||
|
workingDir.resolve("graphBasePath/relation/rels.json").toFile());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void afterAll() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
|
spark.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPatchRelationApplication() throws Exception {
|
||||||
|
|
||||||
|
final String graphBasePath = workingDir.toString() + "/graphBasePath";
|
||||||
|
PatchRelationsApplication.main(new String[] {
|
||||||
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"-graphBasePath", graphBasePath,
|
||||||
|
"-workingDir", workingDir.toString() + "/workingDir",
|
||||||
|
"-idMappingPath", workingDir.toString() + "/" + ID_MAPPING_PATH
|
||||||
|
});
|
||||||
|
|
||||||
|
final List<Relation> rels = spark
|
||||||
|
.read()
|
||||||
|
.textFile(graphBasePath + "/relation")
|
||||||
|
.map(
|
||||||
|
(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
|
||||||
|
Encoders.bean(Relation.class))
|
||||||
|
.collectAsList();
|
||||||
|
|
||||||
|
assertEquals(6, rels.size());
|
||||||
|
|
||||||
|
assertEquals(0, getCount(rels, "1a"), "should be patched to 1b");
|
||||||
|
assertEquals(0, getCount(rels, "2a"), "should be patched to 2b");
|
||||||
|
|
||||||
|
assertEquals(2, getCount(rels, "10a"), "not included in patching");
|
||||||
|
assertEquals(2, getCount(rels, "20a"), "not included in patching");
|
||||||
|
|
||||||
|
assertEquals(2, getCount(rels, "15a"), "not included in patching");
|
||||||
|
assertEquals(2, getCount(rels, "25a"), "not included in patching");
|
||||||
|
|
||||||
|
assertEquals(2, getCount(rels, "1b"), "patched from 1a");
|
||||||
|
assertEquals(2, getCount(rels, "2b"), "patched from 2a");
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getCount(List<Relation> rels, final String id) {
|
||||||
|
return rels.stream().filter(r -> r.getSource().equals(id) || r.getTarget().equals(id)).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,72 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<record xmlns="http://datacite.org/schema/kernel-4"
|
||||||
|
xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:oaf="http://namespace.openaire.eu/oaf">
|
||||||
|
<oai:header xmlns="http://namespace.openaire.eu/"
|
||||||
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||||
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<dri:objIdentifier>enermaps____::04149ee428d07360314c2cb3ba95d41e</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>tgs00004</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection>2021-07-20T18:43:12.096+02:00</dri:dateOfCollection>
|
||||||
|
<oaf:datasourceprefix>enermaps____</oaf:datasourceprefix>
|
||||||
|
</oai:header>
|
||||||
|
<metadata>
|
||||||
|
<resource>
|
||||||
|
<identifier identifierType="URL">https://ec.europa.eu/eurostat/web/products-datasets/-/tgs00004</identifier>
|
||||||
|
<creators>
|
||||||
|
<creator>
|
||||||
|
<creatorName>Statistical Office of the European Union (Eurostat)</creatorName>
|
||||||
|
</creator>
|
||||||
|
</creators>
|
||||||
|
<titles>
|
||||||
|
<title>
|
||||||
|
Regional GDP
|
||||||
|
</title>
|
||||||
|
</titles>
|
||||||
|
<publisher>Statistical Office of the European Union (Eurostat)</publisher>
|
||||||
|
<publicationYear>2020</publicationYear>
|
||||||
|
<dates>
|
||||||
|
<date dateType="Issued">2020-10-07</date>
|
||||||
|
</dates>
|
||||||
|
<resourceType resourceTypeGeneral="Dataset"/>
|
||||||
|
<rightsList>
|
||||||
|
<rights rightsURI="info:eu-repo/semantics/openAccess">OPEN</rights>
|
||||||
|
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</rights>
|
||||||
|
</rightsList>
|
||||||
|
<descriptions>
|
||||||
|
<description descriptionType="Abstract" xml:lang="EN">GDP expressed in PPS (purchasing power standards) eliminates differences in price levels between countries. Calculations on a per inhabitant basis allow for the comparison of economies and regions significantly different in absolute size. GDP per inhabitant in PPS is the key variable for determining the eligibility of NUTS 2 regions in the framework of the European Unions structural policy.</description>
|
||||||
|
</descriptions>
|
||||||
|
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
|
||||||
|
<oaf:dateAccepted>2020-10-07</oaf:dateAccepted>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<oaf:license>Creative Commons Attribution 4.0 International</oaf:license>
|
||||||
|
<oaf:hostedBy
|
||||||
|
id="openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18" name="Unknown Repository"/>
|
||||||
|
<oaf:collectedFrom id="enermaps____::db" name="Enermaps"/>
|
||||||
|
<oaf:concept id="enermaps::selection::tgs00004"/>
|
||||||
|
</resource>
|
||||||
|
</metadata>
|
||||||
|
<about xmlns="" xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||||
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2021-07-20T18:43:12.096+02:00">
|
||||||
|
<baseURL>https%3A%2F%2Flab.idiap.ch%2Fenermaps%2Fapi%2Fdatacite</baseURL>
|
||||||
|
<identifier/>
|
||||||
|
<datestamp/>
|
||||||
|
<metadataNamespace/>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk"
|
||||||
|
classname="sysimport:crosswalk"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</record>
|
|
@ -0,0 +1,5 @@
|
||||||
|
{"oldId": "1a", "newId": "1b"}
|
||||||
|
{"oldId": "2a", "newId": "2b"}
|
||||||
|
{"oldId": "3a", "newId": "3b"}
|
||||||
|
{"oldId": "4a", "newId": "4b"}
|
||||||
|
{"oldId": "5a", "newId": "5b"}
|
|
@ -0,0 +1,70 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||||
|
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<header xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection/>
|
||||||
|
<dri:mdFormat/>
|
||||||
|
<dri:mdFormatInterpretation/>
|
||||||
|
<dri:repositoryId/>
|
||||||
|
<dr:objectIdentifier/>
|
||||||
|
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
|
||||||
|
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
|
||||||
|
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
|
||||||
|
</header>
|
||||||
|
<metadata xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dc:title>多項式GCDを用いた復号法に関する研究</dc:title>
|
||||||
|
<dc:creator>上原, 剛</dc:creator>
|
||||||
|
<dc:creator>甲斐, 博</dc:creator>
|
||||||
|
<dc:creator>野田, 松太郎</dc:creator>
|
||||||
|
<dc:format>application/pdf</dc:format>
|
||||||
|
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
|
||||||
|
<dc:language>jpn</dc:language>
|
||||||
|
<dc:publisher>京都大学数理解析研究所</dc:publisher>
|
||||||
|
<dc:subject classid="ndc" classname="ndc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
|
||||||
|
<dc:type>Departmental Bulletin Paper</dc:type>
|
||||||
|
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
|
||||||
|
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
|
||||||
|
<oaf:projectid/>
|
||||||
|
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
|
||||||
|
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
|
||||||
|
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
|
||||||
|
</metadata>
|
||||||
|
<about>
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
|
||||||
|
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
|
||||||
|
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
|
||||||
|
<datestamp>2021-04-13T13:36:29Z</datestamp>
|
||||||
|
<metadataNamespace/>
|
||||||
|
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
|
||||||
|
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
|
||||||
|
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
|
||||||
|
<datestamp>2012-07-12T14:15:41Z</datestamp>
|
||||||
|
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
|
||||||
|
</originDescription>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||||
|
classname="sysimport:crosswalk:repository"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</record>
|
|
@ -0,0 +1,6 @@
|
||||||
|
{"source":"1a","target":"10a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||||
|
{"source":"10a","target":"1a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||||
|
{"source":"2a","target":"20a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||||
|
{"source":"20a","target":"2a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||||
|
{"source":"15a","target":"25a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||||
|
{"source":"25a","target":"15a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
File diff suppressed because one or more lines are too long
|
@ -9,6 +9,41 @@
|
||||||
|
|
||||||
<artifactId>dhp-graph-provision</artifactId>
|
<artifactId>dhp-graph-provision</artifactId>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>net.alchim31.maven</groupId>
|
||||||
|
<artifactId>scala-maven-plugin</artifactId>
|
||||||
|
<version>4.0.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>scala-compile-first</id>
|
||||||
|
<phase>initialize</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
<goal>compile</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>scala-test-compile</id>
|
||||||
|
<phase>process-test-resources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>testCompile</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
<configuration>
|
||||||
|
<args>
|
||||||
|
<arg>-Xmax-classfile-name</arg>
|
||||||
|
<arg>200</arg>
|
||||||
|
</args>
|
||||||
|
<scalaVersion>${scala.version}</scalaVersion>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
|
||||||
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -10,6 +10,7 @@ import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
@ -81,6 +82,7 @@ public class PrepareRelationsJob {
|
||||||
|
|
||||||
Set<String> relationFilter = Optional
|
Set<String> relationFilter = Optional
|
||||||
.ofNullable(parser.get("relationFilter"))
|
.ofNullable(parser.get("relationFilter"))
|
||||||
|
.map(String::toLowerCase)
|
||||||
.map(s -> Sets.newHashSet(Splitter.on(",").split(s)))
|
.map(s -> Sets.newHashSet(Splitter.on(",").split(s)))
|
||||||
.orElse(new HashSet<>());
|
.orElse(new HashSet<>());
|
||||||
log.info("relationFilter: {}", relationFilter);
|
log.info("relationFilter: {}", relationFilter);
|
||||||
|
@ -130,7 +132,7 @@ public class PrepareRelationsJob {
|
||||||
|
|
||||||
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
|
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
|
||||||
.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
|
.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
|
||||||
.filter(rel -> relationFilter.contains(rel.getRelClass()) == false);
|
.filter(rel -> relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())) == false);
|
||||||
|
|
||||||
JavaRDD<Relation> pruned = pruneRels(
|
JavaRDD<Relation> pruned = pruneRels(
|
||||||
pruneRels(
|
pruneRels(
|
||||||
|
|
|
@ -42,6 +42,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
|
||||||
public class XmlRecordFactory implements Serializable {
|
public class XmlRecordFactory implements Serializable {
|
||||||
|
|
||||||
|
@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.getOriginalId()
|
.getOriginalId()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
|
.filter(id -> !id.matches("^\\d{2}" + IdentifierFactory.ID_PREFIX_SEPARATOR))
|
||||||
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
|
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
|
@ -71,6 +71,9 @@ public class DropAndCreateESIndex {
|
||||||
log.info(STATUS_CODE_TEXT, response.getStatusLine());
|
log.info(STATUS_CODE_TEXT, response.getStatusLine());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.info("Sleeping 60 seconds to avoid to lost the creation of index request");
|
||||||
|
Thread.sleep(60000);
|
||||||
|
|
||||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||||
|
|
||||||
final String summaryConf = IOUtils
|
final String summaryConf = IOUtils
|
||||||
|
|
|
@ -21,8 +21,10 @@ import com.google.common.collect.Lists;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
|
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
||||||
|
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
@ -131,4 +133,32 @@ public class XmlRecordFactoryTest {
|
||||||
System.out.println(doc.asXML());
|
System.out.println(doc.asXML());
|
||||||
assertEquals("", doc.valueOf("//rel/validated"));
|
assertEquals("", doc.valueOf("//rel/validated"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEnermapsRecord() throws IOException, DocumentException {
|
||||||
|
|
||||||
|
String contextmap = "<entries><entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>"
|
||||||
|
+
|
||||||
|
"<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>" +
|
||||||
|
"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>" +
|
||||||
|
"</entries>";
|
||||||
|
|
||||||
|
ContextMapper contextMapper = ContextMapper.fromXml(contextmap);
|
||||||
|
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation,
|
||||||
|
otherDsTypeId);
|
||||||
|
|
||||||
|
Dataset d = OBJECT_MAPPER
|
||||||
|
.readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class);
|
||||||
|
|
||||||
|
JoinedEntity je = new JoinedEntity<>(d);
|
||||||
|
|
||||||
|
String xml = xmlRecordFactory.build(je);
|
||||||
|
|
||||||
|
assertNotNull(xml);
|
||||||
|
|
||||||
|
Document doc = new SAXReader().read(new StringReader(xml));
|
||||||
|
assertNotNull(doc);
|
||||||
|
System.out.println(doc.asXML());
|
||||||
|
assertEquals("enermaps::selection::tgs00004", doc.valueOf("//concept/@id"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -13,7 +13,7 @@ echo "Getting file from " $SCRIPT_PATH
|
||||||
hdfs dfs -copyToLocal $SCRIPT_PATH
|
hdfs dfs -copyToLocal $SCRIPT_PATH
|
||||||
|
|
||||||
echo "Creating indicators"
|
echo "Creating indicators"
|
||||||
impala-shell -d ${TARGET} -q "invalidate metadata"
|
impala-shell -q "invalidate metadata"
|
||||||
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f -
|
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f -
|
||||||
cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f -
|
cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f -
|
||||||
echo "Indicators created"
|
echo "Indicators created"
|
|
@ -57,12 +57,14 @@ UNION ALL
|
||||||
SELECT * FROM ${stats_db_name}.software_sources
|
SELECT * FROM ${stats_db_name}.software_sources
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
|
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
|
||||||
--
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS FOR COLUMNS;
|
create table ${stats_db_name}.result_orcid as
|
||||||
-- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS;
|
select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
|
||||||
-- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS FOR COLUMNS;
|
from (
|
||||||
-- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS;
|
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
|
||||||
-- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS FOR COLUMNS;
|
FROM ${openaire_db_name}.result res
|
||||||
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS;
|
LATERAL VIEW explode(author) a as auth
|
||||||
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS FOR COLUMNS;
|
LATERAL VIEW explode(auth.pid) ap as auth_pid
|
||||||
|
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
|
||||||
|
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res
|
|
@ -34,12 +34,3 @@ union all
|
||||||
select * from ${stats_db_name}.software_refereed
|
select * from ${stats_db_name}.software_refereed
|
||||||
union all
|
union all
|
||||||
select * from ${stats_db_name}.otherresearchproduct_refereed;
|
select * from ${stats_db_name}.otherresearchproduct_refereed;
|
||||||
--
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS FOR COLUMNS;
|
|
|
@ -40,3 +40,197 @@ join result_instance ri on ri.id = p.id
|
||||||
join datasource on datasource.id = ri.hostedby
|
join datasource on datasource.id = ri.hostedby
|
||||||
where datasource.id like '%doajarticles%') tmp
|
where datasource.id like '%doajarticles%') tmp
|
||||||
on p.id= tmp.id;
|
on p.id= tmp.id;
|
||||||
|
|
||||||
|
create table indi_project_pubs_count stored as parquet as
|
||||||
|
select pr.id id, count(p.id) total_pubs from project_results pr
|
||||||
|
join publication p on p.id=pr.result
|
||||||
|
group by pr.id;
|
||||||
|
|
||||||
|
create table indi_project_datasets_count stored as parquet as
|
||||||
|
select pr.id id, count(d.id) total_datasets from project_results pr
|
||||||
|
join dataset d on d.id=pr.result
|
||||||
|
group by pr.id;
|
||||||
|
|
||||||
|
create table indi_project_software_count stored as parquet as
|
||||||
|
select pr.id id, count(s.id) total_software from project_results pr
|
||||||
|
join software s on s.id=pr.result
|
||||||
|
group by pr.id;
|
||||||
|
|
||||||
|
create table indi_project_otherresearch_count stored as parquet as
|
||||||
|
select pr.id id, count(o.id) total_other from project_results pr
|
||||||
|
join otherresearchproduct o on o.id=pr.result
|
||||||
|
group by pr.id;
|
||||||
|
|
||||||
|
create table indi_pub_avg_year_country_oa stored as parquet as
|
||||||
|
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
|
||||||
|
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
|
||||||
|
from
|
||||||
|
(SELECT year, country, SUM(CASE
|
||||||
|
WHEN bestlicence='Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS OpenAccess, SUM(CASE
|
||||||
|
WHEN bestlicence<>'Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS NonOpenAccess
|
||||||
|
FROM publication p
|
||||||
|
join result_organization ro on p.id=ro.id
|
||||||
|
join organization o on o.id=ro.organization
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by year, country) tmp;
|
||||||
|
|
||||||
|
create table indi_dataset_avg_year_country_oa stored as parquet as
|
||||||
|
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
|
||||||
|
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
|
||||||
|
from
|
||||||
|
(SELECT year, country, SUM(CASE
|
||||||
|
WHEN bestlicence='Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS OpenAccess, SUM(CASE
|
||||||
|
WHEN bestlicence<>'Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS NonOpenAccess
|
||||||
|
FROM dataset d
|
||||||
|
join result_organization ro on d.id=ro.id
|
||||||
|
join organization o on o.id=ro.organization
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by year, country) tmp;
|
||||||
|
|
||||||
|
create table indi_software_avg_year_country_oa stored as parquet as
|
||||||
|
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
|
||||||
|
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
|
||||||
|
from
|
||||||
|
(SELECT year, country, SUM(CASE
|
||||||
|
WHEN bestlicence='Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS OpenAccess, SUM(CASE
|
||||||
|
WHEN bestlicence<>'Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS NonOpenAccess
|
||||||
|
FROM software s
|
||||||
|
join result_organization ro on s.id=ro.id
|
||||||
|
join SOURCER.organization o on o.id=ro.organization
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by year, country) tmp;
|
||||||
|
|
||||||
|
|
||||||
|
create table indi_other_avg_year_country_oa stored as parquet as
|
||||||
|
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
|
||||||
|
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
|
||||||
|
from
|
||||||
|
(SELECT year, country, SUM(CASE
|
||||||
|
WHEN bestlicence='Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS OpenAccess, SUM(CASE
|
||||||
|
WHEN bestlicence<>'Open Access' THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS NonOpenAccess
|
||||||
|
FROM otherresearchproduct orp
|
||||||
|
join result_organization ro on orp.id=ro.id
|
||||||
|
join organization o on o.id=ro.organization
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by year, country) tmp;
|
||||||
|
|
||||||
|
create table indi_pub_avg_year_context_oa stored as parquet as
|
||||||
|
with total as
|
||||||
|
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc
|
||||||
|
join context c on pc.concept like concat('%',c.id,'%')
|
||||||
|
join publication p on p.id=pc.id
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by c.name, year )
|
||||||
|
select year, name, round(no_of_pubs/total*100,3) averageofpubs
|
||||||
|
from total;
|
||||||
|
|
||||||
|
create table indi_dataset_avg_year_context_oa stored as parquet as
|
||||||
|
with total as
|
||||||
|
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc
|
||||||
|
join context c on pc.concept like concat('%',c.id,'%')
|
||||||
|
join dataset p on p.id=pc.id
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by c.name, year )
|
||||||
|
select year, name, round(no_of_pubs/total*100,3) averageofdataset
|
||||||
|
from total;
|
||||||
|
|
||||||
|
create table indi_software_avg_year_context_oa stored as parquet as
|
||||||
|
with total as
|
||||||
|
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc
|
||||||
|
join context c on pc.concept like concat('%',c.id,'%')
|
||||||
|
join software p on p.id=pc.id
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by c.name, year )
|
||||||
|
select year, name, round(no_of_pubs/total*100,3) averageofsoftware
|
||||||
|
from total;
|
||||||
|
|
||||||
|
create table indi_other_avg_year_context_oa stored as parquet as
|
||||||
|
with total as
|
||||||
|
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc
|
||||||
|
join context c on pc.concept like concat('%',c.id,'%')
|
||||||
|
join otherresearchproduct p on p.id=pc.id
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by c.name, year )
|
||||||
|
select year, name, round(no_of_pubs/total*100,3) averageofother
|
||||||
|
from total;
|
||||||
|
|
||||||
|
create table indi_other_avg_year_content_oa stored as parquet as
|
||||||
|
with total as
|
||||||
|
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
|
||||||
|
from otherresearchproduct_datasources pd
|
||||||
|
join datasource d on datasource=d.id
|
||||||
|
join otherresearchproduct p on p.id=pd.id
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by d.type, year)
|
||||||
|
select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct
|
||||||
|
from total;
|
||||||
|
|
||||||
|
create table indi_software_avg_year_content_oa stored as parquet as
|
||||||
|
with total as
|
||||||
|
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
|
||||||
|
from software_datasources pd
|
||||||
|
join datasource d on datasource=d.id
|
||||||
|
join software p on p.id=pd.id
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by d.type, year)
|
||||||
|
select year, type, round(no_of_pubs/total*100,3) averageOfSoftware
|
||||||
|
from total;
|
||||||
|
|
||||||
|
create table indi_dataset_avg_year_content_oa stored as parquet as
|
||||||
|
with total as
|
||||||
|
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
|
||||||
|
from dataset_datasources pd
|
||||||
|
join datasource d on datasource=d.id
|
||||||
|
join dataset p on p.id=pd.id
|
||||||
|
where cast(year as int)>=2003 and cast(year as int)<=2021
|
||||||
|
group by d.type, year)
|
||||||
|
select year, type, round(no_of_pubs/total*100,3) averageOfDatasets
|
||||||
|
from total;
|
||||||
|
|
||||||
|
create table indi_pub_avg_year_content_oa stored as parquet as
with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from publication_datasources pd
join datasource d on datasource=d.id
join publication p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfPubs
from total;

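As an illustrative spot check (not part of the committed script), the per-datasource-type shares produced by the table above can be inspected for a single year; the table and column names are exactly those defined above:

-- illustrative query, assuming indi_pub_avg_year_content_oa was built as defined above
select year, type, averageOfPubs
from indi_pub_avg_year_content_oa
where cast(year as int) = 2020
order by averageOfPubs desc;
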
create table indi_pub_has_cc_licence stored as parquet as
select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from publication p
left outer join (select p.id, license.type as lic from publication p
join publication_licenses as license on license.id = p.id
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on p.id= tmp.id;

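For a quick sanity check of the flag (illustrative only, not part of the workflow), the overall share of publications carrying a CC license can be derived directly from the indicator table defined above:

-- illustrative query over indi_pub_has_cc_licence as defined above
select round(sum(has_cc_license) / count(*) * 100, 2) as pct_with_cc_license
from indi_pub_has_cc_licence;
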
create table indi_pub_has_cc_licence_url stored as parquet as
select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url
from publication p
left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host
from publication p
join publication_licenses as license on license.id = p.id
WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp
on p.id= tmp.id;

create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract
from publication;

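Similarly, the distribution of the abstract flag can be eyeballed with a query along these lines (illustrative only):

-- illustrative query over indi_pub_has_abstract as defined above
select has_abstract, count(*) as n_publications
from indi_pub_has_abstract
group by has_abstract;
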
@@ -90,27 +90,8 @@ FROM ${openaire_db_name}.publication p
 where p.datainfo.deletedbyinference = false;

 CREATE TABLE ${stats_db_name}.publication_citations AS
-SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result
+SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.publication p
 lateral view explode(p.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
 and p.datainfo.deletedbyinference = false;
-
--- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS FOR COLUMNS;

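With the column renamed from result to cites, a typical downstream use is counting outgoing citation links per publication; a sketch of such a query (not part of this change) could look like:

-- illustrative: number of extracted citation links per publication
select id, count(cites) as n_citation_links
from ${stats_db_name}.publication_citations
group by id
order by n_citation_links desc
limit 20;
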
@@ -116,6 +116,13 @@ compute stats TARGET.indi_pub_doi_from_crossref;
 create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.indi_pub_gold_oa;

+create view TARGET.indi_dataset_avg_year_country_oa as select * from SOURCE.indi_dataset_avg_year_country_oa orig;
+create view TARGET.indi_project_datasets_count as select * from SOURCE.indi_project_datasets_count orig;
+create view TARGET.indi_project_otherresearch_count as select * from SOURCE.indi_project_otherresearch_count orig;
+create view TARGET.indi_project_pubs_count as select * from SOURCE.indi_project_pubs_count orig;
+create view TARGET.indi_project_software_count as select * from SOURCE.indi_project_software_count orig;
+create view TARGET.indi_pub_avg_year_country_oa as select * from SOURCE.indi_pub_avg_year_country_oa orig;
+
 --denorm
 alter table TARGET.result rename to TARGET.res_tmp;

@@ -41,7 +41,7 @@ FROM ${openaire_db_name}.dataset d
 WHERE d.datainfo.deletedbyinference = FALSE;

 CREATE TABLE ${stats_db_name}.dataset_citations AS
-SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result
+SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.dataset d
 LATERAL VIEW explode(d.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""

@@ -96,20 +96,3 @@ SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subj
 FROM ${openaire_db_name}.dataset p
 LATERAL VIEW explode(p.subject) subjects AS subject
 where p.datainfo.deletedbyinference = false;
---
--- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS FOR COLUMNS;

@@ -41,7 +41,7 @@ from ${openaire_db_name}.software s
 where s.datainfo.deletedbyinference = false;

 CREATE TABLE ${stats_db_name}.software_citations AS
-SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT
+SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.software s
 LATERAL VIEW explode(s.extrainfo) citations as citation
 where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""

@@ -96,20 +96,3 @@ SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subj
 FROM ${openaire_db_name}.software p
 LATERAL VIEW explode(p.subject) subjects AS subject
 where p.datainfo.deletedbyinference = false;
---
--- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS FOR COLUMNS;

@@ -41,7 +41,7 @@ WHERE o.datainfo.deletedbyinference = FALSE;

 -- Otherresearchproduct_citations
 CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS
-SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT
+SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
 and o.datainfo.deletedbyinference = false;

@@ -87,20 +87,3 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS
 SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
 FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
 where p.datainfo.deletedbyinference = false;
-
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS FOR COLUMNS;

@@ -13,11 +13,17 @@ WHERE r.reltype = 'projectOrganization'
 and r.datainfo.deletedbyinference = false;

 CREATE TABLE ${stats_db_name}.project_results AS
-SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result
+SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
 FROM ${openaire_db_name}.relation r
 WHERE r.reltype = 'resultProject'
 and r.datainfo.deletedbyinference = false;

+create table ${stats_db_name}.project_classification as
+select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
+from ${openaire_db_name}.project p
+lateral view explode(p.h2020classification) classifs as class
+where p.datainfo.deletedbyinference=false and class.h2020programme is not null;
+
 CREATE TABLE ${stats_db_name}.project_tmp
 (
     id STRING,

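The new project_classification table can be summarised per H2020 programme level; the query below is only an illustration of how the table is meant to be consumed, using the columns defined in the hunk above:

-- illustrative: number of projects per level1 classification
select level1, count(distinct id) as n_projects
from ${stats_db_name}.project_classification
group by level1
order by n_projects desc;
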
@@ -130,12 +130,7 @@ WHERE r.reltype = 'resultOrganization'
 and r.datainfo.deletedbyinference = false;

 CREATE TABLE ${stats_db_name}.result_projects AS
-select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend
+select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
 FROM ${stats_db_name}.result r
 JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
 JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id;
-
--- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS FOR COLUMNS;

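Carrying the provenance through to result_projects makes it possible to split result-project links by how they were produced; an illustrative breakdown (not part of the committed script) would be:

-- illustrative: result-project links per provenance classname
select provenance, count(*) as n_links
from ${stats_db_name}.result_projects
group by provenance;
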
@@ -17,7 +17,9 @@ CREATE TABLE ${stats_db_name}.datasource_tmp
 `latitude` STRING,
 `longitude` STRING,
 `websiteurl` STRING,
-`compatibility` STRING
+`compatibility` STRING,
+issn_printed STRING,
+issn_online STRING
 ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');

 -- Insert statement that takes into account the piwik_id of the openAIRE graph

@@ -32,7 +34,9 @@ SELECT substr(d1.id, 4) AS id,
 d1.latitude.value AS latitude,
 d1.longitude.value AS longitude,
 d1.websiteurl.value AS websiteurl,
-d1.openairecompatibility.classid AS compatibility
+d1.openairecompatibility.classid AS compatibility,
+d1.journal.issnprinted AS issn_printed,
+d1.journal.issnonline AS issn_online
 FROM ${openaire_db_name}.datasource d1
 LEFT OUTER JOIN
 (SELECT id, split(originalidd, '\\:')[1] as piwik_id

@@ -51,7 +55,7 @@ CREATE TABLE ${stats_db_name}.dual
 INSERT INTO ${stats_db_name}.dual
 VALUES ('X');
 INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
-`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`)
+`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
 SELECT 'other',
 'Other',
 'Repository',

@@ -62,7 +66,9 @@ SELECT 'other',
 NULL,
 NULL,
 NULL,
-'unknown'
+'unknown',
+null,
+null
 FROM ${stats_db_name}.dual
 WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
 DROP TABLE ${stats_db_name}.dual;

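Once the ISSN columns are populated, journal-like datasources can be picked out of datasource_tmp; for example (illustrative only, columns as declared in the hunks above):

-- illustrative: datasources that carry a printed or online ISSN
select id, name, issn_printed, issn_online
from ${stats_db_name}.datasource_tmp
where issn_printed is not null or issn_online is not null
limit 20;
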
@@ -98,12 +104,3 @@ where d.datainfo.deletedbyinference = false;
 CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
 SELECT datasource AS id, id AS result
 FROM ${stats_db_name}.result_datasources;
-
--- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS FOR COLUMNS;

pom.xml

@@ -205,6 +205,11 @@
 <artifactId>dateparser</artifactId>
 <version>1.0.7</version>
 </dependency>
+<dependency>
+<groupId>me.xuender</groupId>
+<artifactId>unidecode</artifactId>
+<version>0.0.7</version>
+</dependency>

 <dependency>
 <groupId>com.google.guava</groupId>

@@ -736,7 +741,7 @@
 <mockito-core.version>3.3.3</mockito-core.version>
 <mongodb.driver.version>3.4.2</mongodb.driver.version>
 <vtd.version>[2.12,3.0)</vtd.version>
-<dhp-schemas.version>[2.7.14]</dhp-schemas.version>
+<dhp-schemas.version>[2.7.15]</dhp-schemas.version>
 <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
 <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>