refactoring of the procedure for the id generation, minor changes and addition of a comparison on the original id and the origin datasource

implementation of the dedup_id generation using pids to make the graph more stable
2020-07-24 20:10:47 +02:00 · 2020-07-22 17:29:48 +02:00
155 changed files with 1753 additions and 3925 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java
@ -1,22 +1,15 @@

 package eu.dnetlib.dhp.utils;

-import java.util.Map;
-
-import javax.xml.ws.BindingProvider;
-
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

 public class ISLookupClientFactory {

-	private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
-
-	private static int requestTimeout = 60000 * 10;
-	private static int connectTimeout = 60000 * 10;
+	private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);

 	public static ISLookUpService getLookUpService(final String isLookupUrl) {
 		return getServiceStub(ISLookUpService.class, isLookupUrl);
@ -28,25 +21,6 @@ public class ISLookupClientFactory {
 		final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
 		jaxWsProxyFactory.setServiceClass(clazz);
 		jaxWsProxyFactory.setAddress(endpoint);
-
-		final T service = (T) jaxWsProxyFactory.create();
-
-		if (service instanceof BindingProvider) {
-			log
-				.info(
-					"setting timeouts for {} to requestTimeout: {}, connectTimeout: {}",
-					BindingProvider.class.getName(), requestTimeout, connectTimeout);
-
-			Map<String, Object> requestContext = ((BindingProvider) service).getRequestContext();
-
-			requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout);
-			requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout);
-			requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout);
-			requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout);
-			requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout);
-			requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout);
-		}
-
-		return service;
+		return (T) jaxWsProxyFactory.create();
 	}
 }
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@ -14,37 +14,6 @@

    <description>This module contains common schema classes meant to be used across the dnet-hadoop submodules</description>

-    <build>
-        <plugins>
-            <plugin>
-                <groupId>net.alchim31.maven</groupId>
-                <artifactId>scala-maven-plugin</artifactId>
-                <version>4.0.1</version>
-                <executions>
-                    <execution>
-                        <id>scala-compile-first</id>
-                        <phase>initialize</phase>
-                        <goals>
-                            <goal>add-source</goal>
-                            <goal>compile</goal>
-                        </goals>
-                    </execution>
-                    <execution>
-                        <id>scala-test-compile</id>
-                        <phase>process-test-resources</phase>
-                        <goals>
-                            <goal>testCompile</goal>
-                        </goals>
-                    </execution>
-                </executions>
-                <configuration>
-                    <scalaVersion>${scala.version}</scalaVersion>
-                </configuration>
-            </plugin>
-
-        </plugins>
-    </build>
-
    <dependencies>

        <dependency>
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
@ -1,6 +1,8 @@

 package eu.dnetlib.dhp.schema.common;

+import java.security.Key;
+
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala
@ -1,90 +0,0 @@
-package eu.dnetlib.dhp.schema.scholexplorer
-
-import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty}
-
-object OafUtils {
-
-
-
-  def generateKeyValue(key: String, value: String): KeyValue = {
-    val kv: KeyValue = new KeyValue()
-    kv.setKey(key)
-    kv.setValue(value)
-    kv.setDataInfo(generateDataInfo("0.9"))
-    kv
-  }
-
-
-  def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
-    val di = new DataInfo
-    di.setDeletedbyinference(false)
-    di.setInferred(false)
-    di.setInvisible(false)
-    di.setTrust(trust)
-    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
-    di
-  }
-
-  def createQualifier(cls: String, sch: String): Qualifier = {
-    createQualifier(cls, cls, sch, sch)
-  }
-
-
-  def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
-    val q: Qualifier = new Qualifier
-    q.setClassid(classId)
-    q.setClassname(className)
-    q.setSchemeid(schemeId)
-    q.setSchemename(schemeName)
-    q
-  }
-
-
-  def asField[T](value: T): Field[T] = {
-    val tmp = new Field[T]
-    tmp.setValue(value)
-    tmp
-
-
-  }
-
-  def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
-    val sp = new StructuredProperty
-    sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
-    sp.setValue(value)
-    sp
-
-  }
-
-
-
-  def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
-    val sp = new StructuredProperty
-    sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
-    sp.setValue(value)
-    sp.setDataInfo(dataInfo)
-    sp
-
-  }
-
-  def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
-    val sp = new StructuredProperty
-    sp.setQualifier(createQualifier(classId, schemeId))
-    sp.setValue(value)
-    sp
-
-  }
-
-
-
-  def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
-    val sp = new StructuredProperty
-    sp.setQualifier(createQualifier(classId, schemeId))
-    sp.setValue(value)
-    sp.setDataInfo(dataInfo)
-    sp
-
-  }
-
-
-}
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java
@ -34,10 +34,7 @@ public class EventFactory {
 		final MappedFields map = createMapFromResult(updateInfo);

 		final String eventId = calculateEventId(
-			updateInfo.getTopicPath(), updateInfo.getTargetDs().getOpenaireId(), updateInfo
-				.getTarget()
-				.getOpenaireId(),
-			updateInfo.getHighlightValueAsString());
+			updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString());

 		res.setEventId(eventId);
 		res.setProducerId(PRODUCER_ID);
@ -96,13 +93,11 @@ public class EventFactory {
 		return map;
 	}

-	private static String calculateEventId(final String topic, final String dsId, final String publicationId,
-		final String value) {
+	private static String calculateEventId(final String topic, final String publicationId, final String value) {
 		return "event-"
-			+ DigestUtils.md5Hex(topic).substring(0, 4) + "-"
-			+ DigestUtils.md5Hex(dsId).substring(0, 4) + "-"
-			+ DigestUtils.md5Hex(publicationId).substring(0, 7) + "-"
-			+ DigestUtils.md5Hex(value).substring(0, 5);
+			+ DigestUtils.md5Hex(topic).substring(0, 6) + "-"
+			+ DigestUtils.md5Hex(publicationId).substring(0, 8) + "-"
+			+ DigestUtils.md5Hex(value).substring(0, 8);
 	}

 	private static long calculateExpiryDate(final long now) {
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java
@ -48,13 +48,12 @@ public class IndexOnESJob {

 		final JavaRDD<String> inputRdd = ClusterUtils
 			.readPath(spark, eventsPath, Event.class)
+			// .limit(10000) // TODO REMOVE
 			.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
 			.javaRDD();

 		final Map<String, String> esCfg = new HashMap<>();
 		// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
-
-		esCfg.put("es.index.auto.create", "false");
 		esCfg.put("es.nodes", indexHost);
 		esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
 		esCfg.put("es.batch.write.retry.count", "8");
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml
@ -64,11 +64,182 @@
        </configuration>
    </global>

-    <start to="index_es"/>
+    <start to="join_entities_step0"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
+   
+   
+   <action name="join_entities_step0">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>JoinStep0</name>
+            <class>eu.dnetlib.dhp.broker.oa.JoinStep0Job</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="join_entities_step1"/>
+        <error to="Kill"/>
+    </action>
+    
+ <action name="join_entities_step1">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>JoinStep1</name>
+            <class>eu.dnetlib.dhp.broker.oa.JoinStep1Job</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="join_entities_step2"/>
+        <error to="Kill"/>
+    </action>
+    
+    <action name="join_entities_step2">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>JoinStep2</name>
+            <class>eu.dnetlib.dhp.broker.oa.JoinStep2Job</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="join_entities_step3"/>
+        <error to="Kill"/>
+    </action>
+    
+    <action name="join_entities_step3">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>JoinStep3</name>
+            <class>eu.dnetlib.dhp.broker.oa.JoinStep3Job</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="join_entities_step4"/>
+        <error to="Kill"/>
+    </action>
+    
+    <action name="join_entities_step4">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>JoinStep4</name>
+            <class>eu.dnetlib.dhp.broker.oa.JoinStep4Job</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="prepare_groups"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="prepare_groups">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PrepareGroupsJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.PrepareGroupsJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="generate_events"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="generate_events">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>GenerateEventsJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+			<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
+			<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
+			<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
+        </spark>
+        <ok to="index_es"/>
+        <error to="Kill"/>
+    </action>
    
     <action name="index_es">
        <spark xmlns="uri:oozie:spark-action:0.2">
@ -91,10 +262,34 @@
            <arg>--index</arg><arg>${esIndexName}</arg>
            <arg>--esHost</arg><arg>${esIndexHost}</arg>
        </spark>
+        <ok to="stats"/>
+        <error to="Kill"/>
+       </action>
+    	
+    	<action name="stats">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>GenerateStatsJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.GenerateStatsJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
-    	
+
    <end name="End"/>

 </workflow-app>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java
@ -18,7 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.Field;

 public class DatePicker {

-	private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
+	public static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
 	private static final String DATE_DEFAULT_SUFFIX = "01-01";
 	private static final int YEAR_LB = 1300;
 	private static final int YEAR_UB = Year.now().getValue() + 5;
@ -114,7 +114,7 @@ public class DatePicker {
 		}
 	}

-	private static boolean inRange(final String date) {
+	public static boolean inRange(final String date) {
 		final int year = Integer.parseInt(substringBefore(date, "-"));
 		return year >= YEAR_LB && year <= YEAR_UB;
 	}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -1,11 +1,13 @@

 package eu.dnetlib.dhp.oa.dedup;

-import java.io.Serializable;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.commons.lang.StringUtils;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
@ -13,15 +15,12 @@ import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.collect.Lists;
-
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
 import scala.Tuple2;

+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
 public class DedupRecordFactory {

 	private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class);
@ -81,11 +80,16 @@ public class DedupRecordFactory {

 		final Collection<String> dates = Lists.newArrayList();
 		final List<List<Author>> authors = Lists.newArrayList();
+		final List<Identifier> bestPids = Lists.newArrayList();  //best pids list

 		entities
 			.forEachRemaining(
 				t -> {
 					T duplicate = t._2();
+
+					//prepare the list of pids to use for the id generation
+					bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));
+
 					entity.mergeFrom(duplicate);
 					if (ModelSupport.isSubClass(duplicate, Result.class)) {
 						Result r1 = (Result) duplicate;
@ -94,6 +98,7 @@ public class DedupRecordFactory {
 						if (r1.getDateofacceptance() != null)
 							dates.add(r1.getDateofacceptance().getValue());
 					}
+
 				});

 		// set authors and date
@ -102,10 +107,13 @@ public class DedupRecordFactory {
 			((Result) entity).setAuthor(AuthorMerger.merge(authors));
 		}

-		entity.setId(id);
+		entity.setId(IdGenerator.generate(bestPids, id));
+
 		entity.setLastupdatetimestamp(ts);
 		entity.setDataInfo(dataInfo);

 		return entity;
 	}
+
+
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java
@ -0,0 +1,90 @@
+package eu.dnetlib.dhp.oa.dedup;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import org.apache.commons.lang.NullArgumentException;
+import org.apache.commons.lang.StringUtils;
+
+import java.io.Serializable;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+public class IdGenerator implements Serializable {
+
+    //pattern of the dates parsed by extractDate. The formatter is created per call instead of being kept in a
+    //static field: SimpleDateFormat is not synchronized and a static instance is shared by all the threads of the JVM
+    private static final String DATE_FORMAT = "yyyy-MM-dd";
+
+    //datasource ids used by Identifier.compareTo to prefer a provenance (checked for publications and datasets respectively)
+    public static final String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
+    public static final String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
+
+    //pick the best pid from the list (consider date and pidtype) and build the id from it.
+    //Returns defaultID when the list is null or empty, otherwise
+    //<prefix of the original id>|<dedup_wf_001 or createPrefix(pid type)>::<md5 of the original id or of the pid value>
+    public static String generate(List<Identifier> pids, String defaultID) {
+        if (pids == null || pids.isEmpty())
+            return defaultID;
+
+        //the list is not empty, so max() always yields a value
+        final Identifier best = pids.stream()
+                .max(Identifier::compareTo)
+                .get();
+
+        //part of the original id that precedes the first '|'
+        final String idPrefix = best.getOriginalID().split("\\|")[0];
+
+        if (best.isUseOriginal() || best.getPid().getValue() == null) {
+            //no usable pid value: the id is derived from the original id
+            return idPrefix + "|dedup_wf_001::" + DedupUtility.md5(best.getOriginalID());
+        }
+
+        return idPrefix + "|" + createPrefix(best.getPid().getQualifier().getClassid()) + "::" + DedupUtility.md5(best.getPid().getValue());
+
+    }
+
+    //pick the best pid from the entity. Returns a list (length 1) to save time in the call
+    public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {
+
+        if (entity.getPid() == null || entity.getPid().isEmpty())
+            return Lists.newArrayList(originalIdentifier(entity));
+
+        Optional<StructuredProperty> bp = entity.getPid().stream()
+                //a pid without qualifier cannot be ranked: skip it instead of failing with a NullPointerException
+                .filter(pid -> pid != null && pid.getQualifier() != null)
+                .filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
+                .max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
+
+        return bp.map(structuredProperty ->
+                Lists.newArrayList(new Identifier(structuredProperty, extractDate(entity, new SimpleDateFormat(DATE_FORMAT)), PidType.classidValueOf(structuredProperty.getQualifier().getClassid()), entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId()))
+        ).orElseGet(() -> Lists.newArrayList(originalIdentifier(entity)));
+
+    }
+
+    //fallback identifier for an entity without a usable pid: empty pid, type 'original', creation time as date
+    private static <T extends OafEntity> Identifier originalIdentifier(T entity) {
+        return new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId());
+    }
+
+    //create the prefix (length = 12): dedup_+ pidType, padded with '_' or truncated to 12 characters
+    public static String createPrefix(String pidType) {
+
+        StringBuilder prefix = new StringBuilder("dedup_" + pidType);
+
+        while (prefix.length() < 12) {
+            prefix.append("_");
+        }
+        return prefix.toString().substring(0, 12);
+
+    }
+
+    //extracts the date from the record. If the record is not a Result, or its date of acceptance is not available
+    //or is not wellformed, it returns a base date: 2000-01-01. If the parsing fails it returns the current time
+    public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf){
+
+        String date = "2000-01-01";
+        if (ModelSupport.isSubClass(duplicate, Result.class)) {
+            Result result = (Result) duplicate;
+            if (isWellformed(result.getDateofacceptance())){
+                date = result.getDateofacceptance().getValue();
+            }
+        }
+
+        try {
+            return sdf.parse(date);
+        } catch (ParseException e) {
+            return new Date();
+        }
+
+    }
+
+    //a date is wellformed when it is not blank, matches DatePicker.DATE_PATTERN and its year is accepted by DatePicker.inRange
+    public static boolean isWellformed(Field<String> date) {
+        return date != null && StringUtils.isNotBlank(date.getValue()) && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
+    }
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Identifier.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Identifier.java
@ -0,0 +1,132 @@
+package eu.dnetlib.dhp.oa.dedup;
+
+import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+import java.io.Serializable;
+import java.util.Date;
+import java.util.List;
+
+//candidate used to pick the id of a dedup record: the pid of an entity together with the information used to rank it
+public class Identifier implements Serializable, Comparable<Identifier>{
+
+    StructuredProperty pid;
+    Date date;
+    PidType type;
+    List<KeyValue> collectedFrom;
+    EntityType entityType;
+    String originalID;
+
+    boolean useOriginal = false;  //to know if the top identifier won because of the alphabetical order of the original ID
+
+    public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom, EntityType entityType, String originalID) {
+        this.pid = pid;
+        this.date = date;
+        this.type = type;
+        this.collectedFrom = collectedFrom;
+        this.entityType = entityType;
+        this.originalID = originalID;
+    }
+
+    public StructuredProperty getPid() {
+        return pid;
+    }
+
+    public void setPid(StructuredProperty pidValue) {
+        //assign the parameter: the previous 'this.pid = pid' assigned the field to itself and ignored the argument
+        this.pid = pidValue;
+    }
+
+    public Date getDate() {
+        return date;
+    }
+
+    public void setDate(Date date) {
+        this.date = date;
+    }
+
+    public PidType getType() {
+        return type;
+    }
+
+    public void setType(PidType type) {
+        this.type = type;
+    }
+
+    public List<KeyValue> getCollectedFrom() {
+        return collectedFrom;
+    }
+
+    public void setCollectedFrom(List<KeyValue> collectedFrom) {
+        this.collectedFrom = collectedFrom;
+    }
+
+    public EntityType getEntityType() {
+        return entityType;
+    }
+
+    public void setEntityType(EntityType entityType) {
+        this.entityType = entityType;
+    }
+
+    public String getOriginalID() {
+        return originalID;
+    }
+
+    public void setOriginalID(String originalID) {
+        this.originalID = originalID;
+    }
+
+    public boolean isUseOriginal() {
+        return useOriginal;
+    }
+
+    public void setUseOriginal(boolean useOriginal) {
+        this.useOriginal = useOriginal;
+    }
+
+    //the greater identifier is the preferred one. Note that the comparison can raise the useOriginal flag of
+    //one of the two identifiers (see below)
+    @Override
+    public int compareTo(Identifier i) {
+        //priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) alphabetical order of the originalID
+        final int byType = this.getType().compareTo(i.getType());
+        if (byType != 0)
+            return byType;
+
+        //same type
+        if (entityType == EntityType.publication) {
+            if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
+                return 1;
+            if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
+                return -1;
+        }
+        if (entityType == EntityType.dataset) {
+            if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
+                return 1;
+            if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
+                return -1;
+        }
+
+        //compare with the date of the OTHER identifier (it was compared with this.date, so the dates always looked equal).
+        //Identifiers of type 'original' are created by IdGenerator with the creation time as date: that value is only a
+        //placeholder and must not influence the ordering, otherwise the result would depend on the processing order
+        final int byDate = this.getType() == PidType.original ? 0 : this.getDate().compareTo(i.getDate());
+        if (byDate != 0)
+            //the minus is because we need to take the elder date
+            return -byDate;
+
+        //same date
+        final int byId = this.originalID.compareTo(i.originalID);
+
+        //NOTE(review): the flag is raised on the identifier that loses this comparison (the one with the greater
+        //originalID, or 'i' otherwise), while the comment on the field says it marks the winner - confirm the intent
+        if (byId > 0)
+            this.useOriginal = true;
+        else
+            i.setUseOriginal(true);
+
+        //the minus because we need to take the alphabetically lower id
+        return -byId;
+
+    }
+
+    //true if one of the collectedFrom entries has the given datasource id as key; a null list or null entries/keys never match
+    public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId){
+
+        if (collectedFrom == null)
+            return false;
+
+        for(KeyValue cf: collectedFrom) {
+            if(cf != null && cf.getKey() != null && cf.getKey().equals(dsId))
+                return true;
+        }
+        return false;
+    }
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/PidType.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/PidType.java
@ -0,0 +1,25 @@
+package eu.dnetlib.dhp.oa.dedup;
+
+//pid types known to the dedup id generation. The declaration order is significant: the types are compared
+//through the natural ordering of the enum, so a constant declared later wins over the ones declared before it
+public enum PidType {
+
+    //from the less to the more important
+    undefined,
+    original,
+    orcid,
+    ror,
+    grid,
+    pdb,
+    arXiv,
+    pmid,
+    doi;
+
+    //returns the constant whose name is exactly s (case sensitive); undefined for null or unknown values
+    public static PidType classidValueOf(String s){
+        for (PidType candidate : values()) {
+            if (candidate.name().equals(s))
+                return candidate;
+        }
+        return undefined;
+    }
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -22,10 +22,13 @@ public class EntityMergerTest implements Serializable {

 	List<Tuple2<String, Publication>> publications;
 	List<Tuple2<String, Publication>> publications2;
+	List<Tuple2<String, Publication>> publications3;
+	List<Tuple2<String, Publication>> publications4;
+	List<Tuple2<String, Publication>> publications5;

 	String testEntityBasePath;
 	DataInfo dataInfo;
-	String dedupId = "dedup_id";
+	String dedupId = "00|dedup_id::1";
 	Publication pub_top;

 	@BeforeEach
@ -38,6 +41,9 @@ public class EntityMergerTest implements Serializable {

 		publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
 		publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
+		publications3 = readSample(testEntityBasePath + "/publication_merge3.json", Publication.class);
+		publications4 = readSample(testEntityBasePath + "/publication_merge4.json", Publication.class);
+		publications5 = readSample(testEntityBasePath + "/publication_merge5.json", Publication.class);

 		pub_top = getTopPub(publications);

@ -54,6 +60,9 @@ public class EntityMergerTest implements Serializable {
 			.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);

 		assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
+
+		assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
+
 	}

 	@Test
@ -62,7 +71,8 @@ public class EntityMergerTest implements Serializable {
 		Publication pub_merged = DedupRecordFactory
 			.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);

-		assertEquals(dedupId, pub_merged.getId());
+		// verify id
+		assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");

 		assertEquals(pub_merged.getJournal(), pub_top.getJournal());
 		assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright());
@ -117,8 +127,43 @@ public class EntityMergerTest implements Serializable {
 		Publication pub_merged = DedupRecordFactory
 			.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);

+		// verify id
+		assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
+
 		assertEquals(pub_merged.getAuthor().size(), 27);
-		// insert assertions here
+
+	}
+
+	@Test
+	public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
+		// merges the publication_merge3.json sample; the merged record must get the dedup_doi___ id below
+		Publication pub_merged = DedupRecordFactory
+			.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
+
+		// verify id
+		assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
+
+	}
+
+	@Test
+	public void publicationMergerTest4() throws InstantiationException, IllegalAccessException {
+		// merges the publication_merge4.json sample; the expected id has the dedup_wf_001 prefix, not dedup_doi___
+		Publication pub_merged = DedupRecordFactory
+			.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
+
+		// verify id
+		assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
+
+	}
+
+	@Test
+	public void publicationMergerTest5() throws InstantiationException, IllegalAccessException {
+		// merges the publication_merge5.json sample; the expected id has the dedup_wf_001 prefix, not dedup_doi___
+		Publication pub_merged = DedupRecordFactory
+			.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
+
+		// verify id
+		assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());

 	}

--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
@ -1,22 +1,12 @@

 package eu.dnetlib.dhp.oa.dedup;

-import static java.nio.file.Files.createTempDirectory;
-
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.count;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.mockito.Mockito.lenient;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Serializable;
-import java.net.URISyntaxException;
-import java.nio.file.Paths;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.stream.Collectors;
-
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.MapDocumentUtil;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@ -35,16 +25,19 @@ import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import scala.Tuple2;

+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URISyntaxException;
+import java.nio.file.Paths;
+
+import static java.nio.file.Files.createTempDirectory;
+import static org.apache.spark.sql.functions.count;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.mockito.Mockito.lenient;
+
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
 public class SparkDedupTest implements Serializable {
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge3.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge3.json
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge4.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge4.json
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge5.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge5.json
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@ -9,37 +9,6 @@

    <artifactId>dhp-graph-mapper</artifactId>

-    <build>
-        <plugins>
-            <plugin>
-                <groupId>net.alchim31.maven</groupId>
-                <artifactId>scala-maven-plugin</artifactId>
-                <version>4.0.1</version>
-                <executions>
-                    <execution>
-                        <id>scala-compile-first</id>
-                        <phase>initialize</phase>
-                        <goals>
-                            <goal>add-source</goal>
-                            <goal>compile</goal>
-                        </goals>
-                    </execution>
-                    <execution>
-                        <id>scala-test-compile</id>
-                        <phase>process-test-resources</phase>
-                        <goals>
-                            <goal>testCompile</goal>
-                        </goals>
-                    </execution>
-                </executions>
-                <configuration>
-                    <scalaVersion>${scala.version}</scalaVersion>
-                </configuration>
-            </plugin>
-        </plugins>
-
-    </build>
-
    <dependencies>

        <dependency>
@ -92,13 +61,6 @@
            <groupId>org.postgresql</groupId>
            <artifactId>postgresql</artifactId>
        </dependency>
-        <dependency>
-            <groupId>org.json4s</groupId>
-            <artifactId>json4s-jackson_2.11</artifactId>
-            <version>3.5.3</version>
-        </dependency>
-
-

    </dependencies>

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
@ -9,7 +9,6 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
@ -43,12 +42,6 @@ public class GraphHiveTableImporterJob {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		int numPartitions = Optional
-			.ofNullable(parser.get("numPartitions"))
-			.map(Integer::valueOf)
-			.orElse(-1);
-		log.info("numPartitions: {}", numPartitions);
-
 		String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);

@ -67,21 +60,16 @@ public class GraphHiveTableImporterJob {
 		conf.set("hive.metastore.uris", hiveMetastoreUris);

 		runWithSparkHiveSession(
-			conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz, numPartitions));
+			conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz));
 	}

 	// protected for testing
 	private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
-		Class<T> clazz, int numPartitions) {
+		Class<T> clazz) {

-		Dataset<String> dataset = spark.read().textFile(inputPath);
-
-		if (numPartitions > 0) {
-			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
-			dataset = dataset.repartition(numPartitions);
-		}
-
-		dataset
+		spark
+			.read()
+			.textFile(inputPath)
 			.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java
@ -1,162 +0,0 @@
-
-package eu.dnetlib.dhp.oa.graph.merge;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.util.Objects;
-import java.util.Optional;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.FilterFunction;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
-import scala.Tuple2;
-
-/**
- * Combines the content from two aggregator graph tables of the same type, entities (or relationships) with the same ids
- * are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined
- * by eu.dnetlib.dhp.schema.common.ModelSupport#idFn()
- */
-public class MergeGraphSparkJob {
-
-	private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
-
-	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
-
-	private static final String PRIORITY_DEFAULT = "BETA"; // BETA | PROD
-
-	public static void main(String[] args) throws Exception {
-
-		String jsonConfiguration = IOUtils
-			.toString(
-				CleanGraphSparkJob.class
-					.getResourceAsStream(
-						"/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json"));
-		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
-		parser.parseArgument(args);
-
-		String priority = Optional
-			.ofNullable(parser.get("priority"))
-			.orElse(PRIORITY_DEFAULT);
-		log.info("priority: {}", priority);
-
-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-
-		String betaInputPath = parser.get("betaInputPath");
-		log.info("betaInputPath: {}", betaInputPath);
-
-		String prodInputPath = parser.get("prodInputPath");
-		log.info("prodInputPath: {}", prodInputPath);
-
-		String outputPath = parser.get("outputPath");
-		log.info("outputPath: {}", outputPath);
-
-		String graphTableClassName = parser.get("graphTableClassName");
-		log.info("graphTableClassName: {}", graphTableClassName);
-
-		Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
-
-		SparkConf conf = new SparkConf();
-		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
-		conf.registerKryoClasses(ModelSupport.getOafModelClasses());
-
-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				removeOutputDir(spark, outputPath);
-				mergeGraphTable(spark, priority, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath);
-			});
-	}
-
-	private static <P extends Oaf, B extends Oaf> void mergeGraphTable(
-		SparkSession spark,
-		String priority,
-		String betaInputPath,
-		String prodInputPath,
-		Class<P> p_clazz,
-		Class<B> b_clazz,
-		String outputPath) {
-
-		Dataset<Tuple2<String, B>> beta = readTableFromPath(spark, betaInputPath, b_clazz);
-		Dataset<Tuple2<String, P>> prod = readTableFromPath(spark, prodInputPath, p_clazz);
-
-		prod
-			.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer")
-			.map((MapFunction<Tuple2<Tuple2<String, P>, Tuple2<String, B>>, P>) value -> {
-				Optional<P> p = Optional.ofNullable(value._1()).map(Tuple2::_2);
-				Optional<B> b = Optional.ofNullable(value._2()).map(Tuple2::_2);
-				switch (priority) {
-					default:
-					case "BETA":
-						return mergeWithPriorityToBETA(p, b);
-					case "PROD":
-						return mergeWithPriorityToPROD(p, b);
-				}
-			}, Encoders.bean(p_clazz))
-			.filter((FilterFunction<P>) Objects::nonNull)
-			.write()
-			.mode(SaveMode.Overwrite)
-			.option("compression", "gzip")
-			.json(outputPath);
-	}
-
-	private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToPROD(Optional<P> p, Optional<B> b) {
-		if (b.isPresent() & !p.isPresent()) {
-			return (P) b.get();
-		}
-		if (p.isPresent()) {
-			return p.get();
-		}
-		return null;
-	}
-
-	private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToBETA(Optional<P> p, Optional<B> b) {
-		if (p.isPresent() & !b.isPresent()) {
-			return p.get();
-		}
-		if (b.isPresent()) {
-			return (P) b.get();
-		}
-		return null;
-	}
-
-	private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableFromPath(
-		SparkSession spark, String inputEntityPath, Class<T> clazz) {
-
-		log.info("Reading Graph table from: {}", inputEntityPath);
-		return spark
-			.read()
-			.textFile(inputEntityPath)
-			.map(
-				(MapFunction<String, Tuple2<String, T>>) value -> {
-					final T t = OBJECT_MAPPER.readValue(value, clazz);
-					final String id = ModelSupport.idFn().apply(t);
-					return new Tuple2<>(id, t);
-				},
-				Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
-	}
-
-	private static void removeOutputDir(SparkSession spark, String path) {
-		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
-	}
-
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@ -1,10 +1,36 @@

 package eu.dnetlib.dhp.oa.graph.raw;

-import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.keyValue;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;

-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;

 import org.apache.commons.lang3.StringUtils;
 import org.dom4j.Document;
@ -14,8 +40,24 @@ import org.dom4j.Node;

 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.LicenseComparator;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Context;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.GeoLocation;
+import eu.dnetlib.dhp.schema.oaf.Instance;
+import eu.dnetlib.dhp.schema.oaf.Journal;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.Software;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

 public abstract class AbstractMdRecordToOafMapper {

@ -57,6 +99,7 @@ public abstract class AbstractMdRecordToOafMapper {
 			final Document doc = DocumentHelper
 				.parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));

+			final String type = doc.valueOf("//dr:CobjCategory/@type");
 			final KeyValue collectedFrom = getProvenanceDatasource(
 				doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");

@ -75,39 +118,12 @@ public abstract class AbstractMdRecordToOafMapper {
 			final DataInfo info = prepareDataInfo(doc, invisible);
 			final long lastUpdateTimestamp = new Date().getTime();

-			final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
-
-			final String type = getResultType(doc, instances);
-
-			return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
+			return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
 		} catch (final Exception e) {
 			throw new RuntimeException(e);
 		}
 	}

-	protected String getResultType(final Document doc, final List<Instance> instances) {
-		String type = doc.valueOf("//dr:CobjCategory/@type");
-
-		if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
-			String instanceType = instances
-				.stream()
-				.map(i -> i.getInstancetype().getClassid())
-				.findFirst()
-				.map(s -> UNKNOWN.equalsIgnoreCase(s) ? "0000" : s)
-				.orElse("0000"); // Unknown
-			return Optional
-				.ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
-				.map(q -> q.getClassid())
-				.orElse("0000");
-			/*
-			 * .orElseThrow( () -> new IllegalArgumentException( String.format("'%s' not mapped in %s", instanceType,
-			 * DNET_RESULT_TYPOLOGIES)));
-			 */
-		}
-
-		return type;
-	}
-
 	private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
 		final String dsId = doc.valueOf(xpathId);
 		final String dsName = doc.valueOf(xpathName);
@ -122,8 +138,8 @@ public abstract class AbstractMdRecordToOafMapper {
 	protected List<Oaf> createOafs(
 		final Document doc,
 		final String type,
-		final List<Instance> instances,
 		final KeyValue collectedFrom,
+		final KeyValue hostedBy,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {

@ -132,14 +148,14 @@ public abstract class AbstractMdRecordToOafMapper {
 		switch (type.toLowerCase()) {
 			case "publication":
 				final Publication p = new Publication();
-				populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
 				p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
 				p.setJournal(prepareJournal(doc, info));
 				oafs.add(p);
 				break;
 			case "dataset":
 				final Dataset d = new Dataset();
-				populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
 				d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
 				d.setStoragedate(prepareDatasetStorageDate(doc, info));
 				d.setDevice(prepareDatasetDevice(doc, info));
@ -152,7 +168,7 @@ public abstract class AbstractMdRecordToOafMapper {
 				break;
 			case "software":
 				final Software s = new Software();
-				populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
 				s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
 				s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
 				s.setLicense(prepareSoftwareLicenses(doc, info));
@ -164,7 +180,7 @@ public abstract class AbstractMdRecordToOafMapper {
 			case "otherresearchproducts":
 			default:
 				final OtherResearchProduct o = new OtherResearchProduct();
-				populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
 				o.setResulttype(ORP_DEFAULT_RESULTTYPE);
 				o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
 				o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
@ -243,16 +259,14 @@ public abstract class AbstractMdRecordToOafMapper {
 	private void populateResultFields(
 		final Result r,
 		final Document doc,
-		final List<Instance> instances,
 		final KeyValue collectedFrom,
+		final KeyValue hostedBy,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
 		r.setDataInfo(info);
 		r.setLastupdatetimestamp(lastUpdateTimestamp);
 		r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
-
-		r.setOriginalId(Arrays.asList(findOriginalId(doc)));
-
+		r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
 		r.setCollectedfrom(Arrays.asList(collectedFrom));
 		r.setPid(prepareResultPids(doc, info));
 		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
@ -277,7 +291,7 @@ public abstract class AbstractMdRecordToOafMapper {
 		r.setCoverage(prepareCoverages(doc, info));
 		r.setContext(prepareContexts(doc, info));
 		r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
-
+		final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
 		r.setInstance(instances);
 		r.setBestaccessright(getBestAccessRights(instances));
 	}
@ -415,18 +429,6 @@ public abstract class AbstractMdRecordToOafMapper {
 		return null;
 	}

-	private String findOriginalId(final Document doc) {
-		final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
-		if (n != null) {
-			final String id = n.valueOf("./*[local-name()='identifier']");
-			if (StringUtils.isNotBlank(id)) {
-				return id;
-			}
-		}
-		return doc.valueOf("//*[local-name()='header']/*[local-name()='identifier']");
-
-	}
-
 	protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
 		return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
 	}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
@ -4,11 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
-import java.util.Map;
 import java.util.Objects;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.function.Function;
-import java.util.function.Predicate;
 import java.util.stream.Collectors;

 import org.apache.commons.lang3.StringUtils;
@ -61,7 +57,6 @@ public class OafMapperUtils {
 			.stream(values)
 			.map(v -> field(v, info))
 			.filter(Objects::nonNull)
-			.filter(distinctByKey(f -> f.getValue()))
 			.collect(Collectors.toList());
 	}

@ -70,7 +65,6 @@ public class OafMapperUtils {
 			.stream()
 			.map(v -> field(v, info))
 			.filter(Objects::nonNull)
-			.filter(distinctByKey(f -> f.getValue()))
 			.collect(Collectors.toList());
 	}

@ -243,10 +237,4 @@ public class OafMapperUtils {
 	public static String asString(final Object o) {
 		return o == null ? "" : o.toString();
 	}
-
-	public static <T> Predicate<T> distinctByKey(
-		final Function<? super T, ?> keyExtractor) {
-		final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
-		return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
-	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
@ -1,89 +0,0 @@
-package eu.dnetlib.dhp.sx.ebi
-import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
-import org.apache.spark.sql.{Encoder, Encoders}
-import org.apache.spark.sql.expressions.Aggregator
-
-
-
-object EBIAggregator {
-
-  def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{
-
-    override def zero: OafDataset = new OafDataset()
-
-    override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
-      b.mergeFrom(a._2)
-      if (b.getId == null)
-        b.setId(a._2.getId)
-      b
-    }
-
-
-    override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
-      wx.mergeFrom(wy)
-      if(wx.getId == null && wy.getId.nonEmpty)
-        wx.setId(wy.getId)
-      wx
-    }
-    override def finish(reduction: OafDataset): OafDataset = reduction
-
-    override def bufferEncoder: Encoder[OafDataset] =
-      Encoders.kryo(classOf[OafDataset])
-
-    override def outputEncoder: Encoder[OafDataset] =
-      Encoders.kryo(classOf[OafDataset])
-  }
-
-
-  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
-
-    override def zero: Publication = new Publication()
-
-    override def reduce(b: Publication, a: (String, Publication)): Publication = {
-      b.mergeFrom(a._2)
-      if (b.getId == null)
-        b.setId(a._2.getId)
-      b
-    }
-
-
-    override def merge(wx: Publication, wy: Publication): Publication = {
-      wx.mergeFrom(wy)
-      if(wx.getId == null && wy.getId.nonEmpty)
-        wx.setId(wy.getId)
-      wx
-    }
-    override def finish(reduction: Publication): Publication = reduction
-
-    override def bufferEncoder: Encoder[Publication] =
-      Encoders.kryo(classOf[Publication])
-
-    override def outputEncoder: Encoder[Publication] =
-      Encoders.kryo(classOf[Publication])
-  }
-
-
-  def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{
-
-    override def zero: Relation = new Relation()
-
-    override def reduce(b: Relation, a: (String, Relation)): Relation = {
-      a._2
-    }
-
-
-    override def merge(a: Relation, b: Relation): Relation = {
-      if(b!= null) b else a
-    }
-    override def finish(reduction: Relation): Relation = reduction
-
-    override def bufferEncoder: Encoder[Relation] =
-      Encoders.kryo(classOf[Relation])
-
-    override def outputEncoder: Encoder[Relation] =
-      Encoders.kryo(classOf[Relation])
-  }
-
-
-
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala
@ -1,138 +0,0 @@
-package eu.dnetlib.dhp.sx.ebi
-import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Instance, KeyValue, Oaf}
-import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
-import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIRelation, OafUtils, ProvenaceInfo}
-import eu.dnetlib.dhp.utils.DHPUtils
-import eu.dnetlib.scholexplorer.relation.RelationMapper
-import org.apache.commons.io.IOUtils
-import org.apache.spark.SparkConf
-import org.apache.spark.sql._
-import org.json4s
-import org.json4s.DefaultFormats
-import org.json4s.JsonAST.{JField, JObject, JString}
-import org.json4s.jackson.JsonMethods.parse
-
-import scala.collection.JavaConverters._
-
-/**
-  * Spark job that converts the `(pmid, links json)` pairs stored under
-  * `workingPath/baseline_links_updates` into OAF objects: for every link a relation, its inverse
-  * and a dataset stub for the link target.
-  */
-object SparkAddLinkUpdates {
-
-  val relationMapper = RelationMapper.load
-
-
-/** Flat view of one link extracted from the JSON payload (relation name, date, target pid/url/title/publisher). */
-case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {}
-
-
-  /** Provenance key/value attached to every object generated by this job. */
-  def generatePubmedDLICollectedFrom(): KeyValue = {
-    OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
-  }
-
-
-  /**
-    * Converts the links of one publication into OAF objects.
-    *
-    * @param input pair made of the publication pmid (`_1`) and the JSON document with its links (`_2`)
-    * @return for every link whose relation name is known to `relationMapper`: the relation from
-    *         the publication to the target, the inverse relation and the target dataset.
-    *         Links with an unmapped relation name are skipped.
-    */
-  def ebiLinksToOaf(input:(String, String)):List[Oaf] = {
-    val pmid :String = input._1
-    val input_json :String = input._2
-    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
-    lazy val json: json4s.JValue = parse(input_json)
-
-
-    // A link is kept only when every field below is present in the JSON.
-    val targets:List[EBILinks] = for {
-      JObject(link) <- json \\ "Category" \\ "Link"
-      JField("PublicationDate", JString(pubdate)) <- link
-      JField("RelationshipType", JObject(relationshipType)) <- link
-      JField("Name", JString(relname)) <- relationshipType
-      JField("Target", JObject(target)) <- link
-      JField("Identifier", JObject(identifier)) <- target
-      JField("ID", JString(tpid)) <- identifier
-      JField("IDScheme", JString(tpidtype)) <- identifier
-      JField("IDURL", JString(turl)) <- identifier
-      JField("Title", JString(title)) <- target
-      JField("Publisher", JObject(pub)) <- target
-      JField("Name", JString(publisher)) <- pub
-    } yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher)
-
-
-
-    val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"
-
-    targets.flatMap(l => {
-      // NOTE(review): presumably the mapper returns null for an unknown relation name; such a
-      // link used to fail the whole task with a NullPointerException, now it is skipped.
-      Option(relationMapper.get(l.relation.toLowerCase)).toList.flatMap(relInfo => {
-        val relation = new DLIRelation
-        val inverseRelation = new DLIRelation
-        val targetDnetId =  s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
-        val relationSemantic = relInfo.getOriginal
-        val inverseRelationSemantic = relInfo.getInverse
-
-        relation.setSource(dnetPublicationId)
-        relation.setTarget(targetDnetId)
-        relation.setRelClass("datacite")
-        relation.setRelType(relationSemantic)
-        relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
-
-        inverseRelation.setSource(targetDnetId)
-        inverseRelation.setTarget(dnetPublicationId)
-        inverseRelation.setRelClass("datacite")
-        inverseRelation.setRelType(inverseRelationSemantic)
-        inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
-
-
-
-        val d = new DLIDataset
-        d.setId(targetDnetId)
-        d.setDataInfo(OafUtils.generateDataInfo())
-        d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, "dnet:pid_types")).asJava)
-        d.setCompletionStatus("complete")
-        val pi = new ProvenaceInfo
-        pi.setId("dli_________::europe_pmc__")
-        pi.setName( "Europe PMC")
-        pi.setCompletionStatus("complete")
-        pi.setCollectionMode("collected")
-        d.setDlicollectedfrom(List(pi).asJava)
-        d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
-        d.setPublisher(OafUtils.asField(l.publisher))
-        d.setTitle(List(OafUtils.createSP(l.title, "main title", "dnet:dataCite_title")).asJava)
-        d.setDateofacceptance(OafUtils.asField(l.pubdate))
-        val i = new Instance
-        i.setCollectedfrom(generatePubmedDLICollectedFrom())
-        i.setDateofacceptance(d.getDateofacceptance)
-        i.setUrl(List(l.turl).asJava)
-        i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
-        d.setInstance(List(i).asJava)
-        List[Oaf](relation, inverseRelation, d)
-      })
-    })
-  }
-
-
-  /**
-    * Entry point. Reads `baseline_links_updates` from the working path, writes the generated OAF
-    * objects to `baseline_links_updates_oaf`, then splits them by type into
-    * `baseline_links_updates_relation` and `baseline_links_updates_dataset`.
-    *
-    * @param args command line arguments, parsed against `ebi_to_df_params.json`
-    */
-  def main(args: Array[String]): Unit = {
-    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
-    parser.parseArgument(args)
-    val spark: SparkSession =
-      SparkSession
-        .builder()
-        .config(conf)
-        // use this job's own name (it was copy-pasted from SparkCreateEBIDataFrame)
-        .appName(SparkAddLinkUpdates.getClass.getSimpleName)
-        .master(parser.get("master")).getOrCreate()
-
-
-    val workingPath = parser.get("workingPath")
-    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
-    implicit val relEncoder: Encoder[DLIRelation] = Encoders.kryo(classOf[DLIRelation])
-    implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
-
-    val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))
-
-    ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")
-
-    // Re-read what was just written so that the two splits below do not recompute the conversion.
-    val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]
-
-    oDataset.filter(p =>p.isInstanceOf[DLIRelation]).map(p => p.asInstanceOf[DLIRelation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
-    oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
-
-
-
-  }
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala
@ -1,49 +0,0 @@
-package eu.dnetlib.dhp.sx.ebi
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import org.apache.commons.io.IOUtils
-import org.apache.spark.SparkConf
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
-import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
-
-
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
-
-/**
-  * Spark job that parses the XML files found under `workingPath/baseline` into
-  * [[PMArticle]] objects and stores them as `workingPath/baseline_dataset`.
-  */
-object SparkCreateBaselineDataFrame {
-
-
-  /**
-    * Entry point.
-    *
-    * @param args command line arguments, parsed against `ebi_to_df_params.json`
-    */
-  def main(args: Array[String]): Unit = {
-    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
-    parser.parseArgument(args)
-    val spark: SparkSession =
-      SparkSession
-        .builder()
-        .config(conf)
-        // use this job's own name (it was copy-pasted from SparkCreateEBIDataFrame)
-        .appName(SparkCreateBaselineDataFrame.getClass.getSimpleName)
-        .master(parser.get("master")).getOrCreate()
-
-    val sc = spark.sparkContext
-
-    val workingPath = parser.get("workingPath")
-
-    implicit  val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
-    implicit  val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
-    implicit  val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
-    // one (path, whole file content) pair per file, spread over at least 2000 partitions
-    val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
-
-    // only the files whose path ends with ".gz" are parsed
-    val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
-      // Feed the already decoded text straight to the reader: going through
-      // getBytes()/fromBytes() re-encoded it with the platform default charset, which loses
-      // characters whenever that charset cannot represent them.
-      val xml = new XMLEventReader(Source.fromString(i._2))
-      new PMParser(xml)
-
-    } ))
-
-    ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
-
-
-
-
-  }
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala
@ -1,87 +0,0 @@
-package eu.dnetlib.dhp.sx.ebi
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
-import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
-import eu.dnetlib.scholexplorer.relation.RelationMapper
-import org.apache.commons.io.IOUtils
-import org.apache.spark.SparkConf
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
-import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
-import org.slf4j.{Logger, LoggerFactory}
-import scala.collection.JavaConverters._
-
-/**
-  * Spark job that parses the records stored under `workingPath/publication_xml` and
-  * `workingPath/dataset_xml`, collects every generated OAF object in `workingPath/oaf`, and
-  * then writes one de-duplicated dataset per type (`publication`, `dataset`, `relation`).
-  */
-object SparkCreateEBIDataFrame {
-
-
-  /**
-    * Entry point.
-    *
-    * @param args command line arguments, parsed against `ebi_to_df_params.json`
-    */
-  def main(args: Array[String]): Unit = {
-    val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass)
-    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
-    parser.parseArgument(args)
-    val spark: SparkSession =
-      SparkSession
-        .builder()
-        .config(conf)
-        .appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
-        .master(parser.get("master")).getOrCreate()
-
-    val sc = spark.sparkContext
-
-
-    val workingPath = parser.get("workingPath")
-    val relationMapper = RelationMapper.load
-
-    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
-    implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
-    implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
-    implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
-
-    logger.info("Extract Publication and relation from publication_xml")
-    // Every input line is a JSON-encoded string wrapping the XML record. The ObjectMapper is
-    // created once per partition instead of once per line.
-    val oafPubsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/publication_xml").mapPartitions(lines => {
-      val jsonMapper = new ObjectMapper()
-      lines.flatMap(s => {
-        val d = new PublicationScholexplorerParser
-        d.parseObject(jsonMapper.readValue(s, classOf[String]), relationMapper).asScala.iterator})
-    })
-
-    spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")
-
-    logger.info("Extract Publication and relation from dataset_xml")
-    val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/dataset_xml").mapPartitions(lines => {
-      val jsonMapper = new ObjectMapper()
-      lines.flatMap(s => {
-        val d = new DatasetScholexplorerParser
-        d.parseObject(jsonMapper.readValue(s, classOf[String]), relationMapper).asScala.iterator})
-    })
-
-    // appended to the publications written above: `oaf` now holds both extractions
-    spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf")
-
-    val oaf: Dataset[Oaf] = spark.read.load(s"$workingPath/oaf").as[Oaf]
-    val dataset: Dataset[OafDataset] = oaf.filter(o => o.isInstanceOf[OafDataset]).map(d => d.asInstanceOf[OafDataset])
-    val publication: Dataset[Publication] = oaf.filter(o => o.isInstanceOf[Publication]).map(d => d.asInstanceOf[Publication])
-    val relations: Dataset[Relation] = oaf.filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation])
-
-    // one record per id
-    publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
-      .groupByKey(_._1)(Encoders.STRING)
-      .agg(EBIAggregator.getPublicationAggregator().toColumn)
-      .map(p => p._2)
-      .write.mode(SaveMode.Overwrite).save(s"$workingPath/publication")
-
-    dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder))
-      .groupByKey(_._1)(Encoders.STRING)
-      .agg(EBIAggregator.getDatasetAggregator().toColumn)
-      .map(p => p._2)
-      .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
-
-    // one record per (source, relType, target) triple
-    relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
-      .groupByKey(_._1)(Encoders.STRING)
-      .agg(EBIAggregator.getRelationAggregator().toColumn)
-      .map(p => p._2)
-      .write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")
-
-  }
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java
@ -1,64 +0,0 @@
-
-package eu.dnetlib.dhp.sx.ebi.model;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Serializable bean holding the fields extracted for one article: pmid, date, journal, title,
- * description and the list of authors (empty until authors are added).
- */
-public class PMArticle implements Serializable {
-
-	private String pmid;
-	private String date;
-	private PMJournal journal;
-	private String title;
-	private String description;
-	private List<PMAuthor> authors = new ArrayList<>();
-
-	/** @return the article pmid, or null when not set */
-	public String getPmid() {
-		return this.pmid;
-	}
-
-	/** @param pmid the article pmid */
-	public void setPmid(final String pmid) {
-		this.pmid = pmid;
-	}
-
-	/** @return the article date, or null when not set */
-	public String getDate() {
-		return this.date;
-	}
-
-	/** @param date the article date */
-	public void setDate(final String date) {
-		this.date = date;
-	}
-
-	/** @return the journal of the article, or null when not set */
-	public PMJournal getJournal() {
-		return this.journal;
-	}
-
-	/** @param journal the journal of the article */
-	public void setJournal(final PMJournal journal) {
-		this.journal = journal;
-	}
-
-	/** @return the article title, or null when not set */
-	public String getTitle() {
-		return this.title;
-	}
-
-	/** @param title the article title */
-	public void setTitle(final String title) {
-		this.title = title;
-	}
-
-	/** @return the article description, or null when not set */
-	public String getDescription() {
-		return this.description;
-	}
-
-	/** @param description the article description */
-	public void setDescription(final String description) {
-		this.description = description;
-	}
-
-	/** @return the live (mutable) list of authors */
-	public List<PMAuthor> getAuthors() {
-		return this.authors;
-	}
-
-	/** @param authors the list replacing the current authors */
-	public void setAuthors(final List<PMAuthor> authors) {
-		this.authors = authors;
-	}
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java
@ -1,31 +0,0 @@
-
-package eu.dnetlib.dhp.sx.ebi.model;
-
-import java.io.Serializable;
-
-/**
- * Serializable bean holding the last name and fore name of one author.
- */
-public class PMAuthor implements Serializable {
-
-	private String lastName;
-	private String foreName;
-
-	/** @return the author last name, or null when not set */
-	public String getLastName() {
-		return this.lastName;
-	}
-
-	/** @param lastName the author last name */
-	public void setLastName(final String lastName) {
-		this.lastName = lastName;
-	}
-
-	/** @return the author fore name, or null when not set */
-	public String getForeName() {
-		return this.foreName;
-	}
-
-	/** @param foreName the author fore name */
-	public void setForeName(final String foreName) {
-		this.foreName = foreName;
-	}
-
-	/**
-	 * @return the fore name and the last name, in this order, separated by ", "; an unset part
-	 *         is rendered as the text "null"
-	 */
-	public String getFullName() {
-		// NOTE(review): comma-separated names are usually written "last, fore" - confirm that
-		// the "fore, last" order is intended.
-		final String first = foreName;
-		final String last = lastName;
-		return String.format("%s, %s", first, last);
-	}
-
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java
@ -1,53 +0,0 @@
-
-package eu.dnetlib.dhp.sx.ebi.model;
-
-import java.io.Serializable;
-
-/**
- * Serializable bean holding the fields extracted for one journal: issn, volume, issue, date
- * and title.
- */
-public class PMJournal implements Serializable {
-
-	private String issn;
-	private String volume;
-	private String issue;
-	private String date;
-	private String title;
-
-	/** @return the journal issn, or null when not set */
-	public String getIssn() {
-		return this.issn;
-	}
-
-	/** @param issn the journal issn */
-	public void setIssn(final String issn) {
-		this.issn = issn;
-	}
-
-	/** @return the volume, or null when not set */
-	public String getVolume() {
-		return this.volume;
-	}
-
-	/** @param volume the volume */
-	public void setVolume(final String volume) {
-		this.volume = volume;
-	}
-
-	/** @return the issue, or null when not set */
-	public String getIssue() {
-		return this.issue;
-	}
-
-	/** @param issue the issue */
-	public void setIssue(final String issue) {
-		this.issue = issue;
-	}
-
-	/** @return the date, or null when not set */
-	public String getDate() {
-		return this.date;
-	}
-
-	/** @param date the date */
-	public void setDate(final String date) {
-		this.date = date;
-	}
-
-	/** @return the journal title, or null when not set */
-	public String getTitle() {
-		return this.title;
-	}
-
-	/** @param title the journal title */
-	public void setTitle(final String title) {
-		this.title = title;
-	}
-}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala
@ -1,92 +0,0 @@
-package eu.dnetlib.dhp.sx.ebi.model
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
-/**
-  * Iterator that lazily turns a stream of XML events into [[PMArticle]] objects, one for every
-  * `PubmedArticle` element. The next article is always parsed one step ahead of the caller.
-  *
-  * @param xml the event reader to consume
-  */
-class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
-
-  // look-ahead slot: the article that the next call to next() returns, null at end of stream
-  var currentArticle:PMArticle = generateNextArticle()
-
-  override def hasNext: Boolean = currentArticle!= null
-
-  override def next(): PMArticle = {
-    val tmp = currentArticle
-    currentArticle = generateNextArticle()
-    tmp
-  }
-
-
-  /**
-    * Consumes events up to the end of the next `PubmedArticle` element.
-    *
-    * All the parsing state is local: the article under construction used to live in
-    * `currentArticle`, so events read while looking for the following article (for instance a
-    * `PMID` found after the last `PubmedArticle`) modified the article already handed out by
-    * `next()`.
-    *
-    * @return the parsed article, or null when the stream holds no further article
-    */
-  def generateNextArticle():PMArticle = {
-
-    var article: PMArticle = null
-    var completed: PMArticle = null
-    var currentAuthor: PMAuthor = null
-    var currentJournal: PMJournal = null
-    var currNode: String = null
-    var currentYear = "0"
-    var currentMonth = "01"
-    var currentDay = "01"
-
-    while (completed == null && xml.hasNext) {
-      xml.next match {
-        case EvElemStart(_, label, _, _) =>
-          currNode = label
-          label match {
-            case "PubmedArticle" => article = new PMArticle
-            case "Author" => currentAuthor = new PMAuthor
-            case "Journal" => currentJournal = new PMJournal
-            case "DateCompleted" | "PubDate" =>
-              // start from the defaults so that a date lacking Month/Day does not inherit the
-              // values of a date element read earlier
-              currentYear = "0"
-              currentMonth = "01"
-              currentDay = "01"
-            case _ =>
-          }
-        case EvElemEnd(_, label) =>
-          label match {
-            case "PubmedArticle" => completed = article
-            case "Author" => if (article != null && currentAuthor != null) article.getAuthors.add(currentAuthor)
-            case "Journal" => if (article != null) article.setJournal(currentJournal)
-            case "DateCompleted" => if (article != null) article.setDate(s"$currentYear-$currentMonth-$currentDay")
-            case "PubDate" => if (currentJournal != null) currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
-            case _ =>
-          }
-        case EvText(text) =>
-          val value = text.trim
-          // text outside a PubmedArticle element is ignored
-          if (article != null && currNode!= null && value.nonEmpty)
-            currNode match {
-              case "ArticleTitle" =>
-                // text may arrive in several chunks: append to what was already read
-                if (article.getTitle==null)
-                  article.setTitle(value)
-                else
-                  article.setTitle(article.getTitle + value)
-              case "AbstractText" =>
-                if (article.getDescription==null)
-                  article.setDescription(value)
-                else
-                  article.setDescription(article.getDescription + value)
-              case "PMID" =>
-                // keep the first PMID of the record; NOTE(review): presumably later PMID
-                // elements belong to nested references to other records - verify against the DTD
-                if (article.getPmid == null)
-                  article.setPmid(value)
-              case "ISSN" => if (currentJournal != null) currentJournal.setIssn(value)
-              case "Year" => currentYear = value
-              case "Month" => currentMonth = value
-              case "Day" => currentDay = value
-              case "Volume" => if (currentJournal != null) currentJournal.setVolume(value)
-              case "Issue" => if (currentJournal != null) currentJournal.setIssue(value)
-              case "LastName" => if (currentAuthor != null) currentAuthor.setLastName(value)
-              case "ForeName" => if (currentAuthor != null) currentAuthor.setForeName(value)
-              case "Title" =>
-                if (currentJournal != null) {
-                  if (currentJournal.getTitle==null)
-                    currentJournal.setTitle(value)
-                  else
-                    currentJournal.setTitle(currentJournal.getTitle + value)
-                }
-              case _ =>
-
-            }
-        case _ =>
-      }
-
-    }
-    completed
-  }
-}
-
-
-
-
-
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java
@ -150,17 +150,6 @@ public abstract class AbstractScholexplorerParser {
 		return uk;
 	}

-	/**
-	 * Builds a {@link Qualifier} from its four components.
-	 *
-	 * @param classId    value stored as class id
-	 * @param className  value stored as class name
-	 * @param schemeId   value stored as scheme id
-	 * @param schemeName value stored as scheme name
-	 * @return the new qualifier
-	 */
-	protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
-		final String schemeName) {
-		final Qualifier q = new Qualifier();
-		q.setClassid(classId);
-		// was a second setClassid call: the class name overwrote the id and was itself never set
-		q.setClassname(className);
-		q.setSchemeid(schemeId);
-		q.setSchemename(schemeName);
-		return q;
-
-	}
-
 	protected void generateRelations(
 		RelationMapper relationMapper,
 		Result parsedObject,
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java
@ -64,6 +64,7 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 				currentDate.setQualifier(dateQualifier);
 				parsedObject.setRelevantdate(Collections.singletonList(currentDate));
 			}
+
 			final String completionStatus = VtdUtilityParser
 				.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
 			final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
@ -148,37 +149,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 			inferPid(currentPid);
 			parsedObject.setPid(Collections.singletonList(currentPid));

-			String resolvedURL = null;
-
-			switch (currentPid.getQualifier().getClassname().toLowerCase()) {
-				case "uniprot":
-					resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue();
-					break;
-				case "ena":
-					if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
-						resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8);
-					break;
-				case "chembl":
-					resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue();
-					break;
-
-				case "ncbi-n":
-					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
-					break;
-				case "ncbi-p":
-					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
-					break;
-				case "genbank":
-					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
-					break;
-				case "pdb":
-					resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
-					break;
-				case "url":
-					resolvedURL = currentPid.getValue();
-					break;
-			}
-
 			final String sourceId = generateId(
 				currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
 			parsedObject.setId(sourceId);
@ -281,11 +251,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 								t -> {
 									final StructuredProperty st = new StructuredProperty();
 									st.setValue(t);
-									st
-										.setQualifier(
-											generateQualifier(
-												"main title", "main title", "dnet:dataCite_title",
-												"dnet:dataCite_title"));
 									return st;
 								})
 							.collect(Collectors.toList()));
@ -317,13 +282,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
 							.collect(Collectors.toList()));
 			}

-			if (StringUtils.isNotBlank(resolvedURL)) {
-				Instance i = new Instance();
-				i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
-				i.setUrl(Collections.singletonList(resolvedURL));
-				parsedObject.setInstance(Collections.singletonList(i));
-			}
-
 			result.add(parsedObject);
 			return result;
 		} catch (Throwable e) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java
@ -202,11 +202,6 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser
 								t -> {
 									final StructuredProperty st = new StructuredProperty();
 									st.setValue(t);
-									st
-										.setQualifier(
-											generateQualifier(
-												"main title", "main title", "dnet:dataCite_title",
-												"dnet:dataCite_title"));
 									return st;
 								})
 							.collect(Collectors.toList()));
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -282,7 +282,6 @@
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>100</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json
@ -5,12 +5,6 @@
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
-  {
-    "paramName": "np",
-    "paramLongName": "numPartitions",
-    "paramDescription": "number of dataset partitions",
-    "paramRequired": false
-  },
  {
    "paramName": "in",
    "paramLongName": "inputPath",
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml
@ -1,18 +0,0 @@
-<configuration>
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
@ -1,293 +0,0 @@
-<workflow-app name="merge graphs" xmlns="uri:oozie:workflow:0.5">
-
-    <parameters>
-        <property>
-            <name>betaInputGgraphPath</name>
-            <description>the beta graph root path</description>
-        </property>
-        <property>
-            <name>prodInputGgraphPath</name>
-            <description>the production graph root path</description>
-        </property>
-        <property>
-            <name>graphOutputPath</name>
-            <description>the output merged graph root path</description>
-        </property>
-        <property>
-            <name>priority</name>
-            <description>decides from which infrastructure the content must win in case of ID clash</description>
-        </property>
-
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-        <property>
-            <name>oozieActionShareLibForSpark2</name>
-            <description>oozie action sharelib for spark 2.*</description>
-        </property>
-        <property>
-            <name>spark2ExtraListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
-            <description>spark 2.* extra listeners classname</description>
-        </property>
-        <property>
-            <name>spark2SqlQueryExecutionListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
-            <description>spark 2.* sql query execution listeners classname</description>
-        </property>
-        <property>
-            <name>spark2YarnHistoryServerAddress</name>
-            <description>spark 2.* yarn history server address</description>
-        </property>
-        <property>
-            <name>spark2EventLogDir</name>
-            <description>spark 2.* event log dir location</description>
-        </property>
-    </parameters>
-
-	<start to="fork_merge_graph"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <fork name="fork_merge_graph">
-        <path start="merge_publication"/>
-        <path start="merge_dataset"/>
-        <path start="merge_otherresearchproduct"/>
-        <path start="merge_software"/>
-        <path start="merge_datasource"/>
-        <path start="merge_organization"/>
-        <path start="merge_project"/>
-        <path start="merge_relation"/>
-    </fork>
-
-    <action name="merge_publication">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge publications</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/publication</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/publication</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="merge_dataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge datasets</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/dataset</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/dataset</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="merge_otherresearchproduct">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge otherresearchproducts</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/otherresearchproduct</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/otherresearchproduct</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="merge_software">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge softwares</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/software</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/software</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="merge_datasource">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge datasources</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/datasource</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/datasource</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="merge_organization">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge organizations</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/organization</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/organization</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="merge_project">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge projects</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/project</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/project</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="merge_relation">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Merge relations</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
-            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/relation</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/relation</arg>
-            <arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
-            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
-            <arg>--priority</arg><arg>${priority}</arg>
-        </spark>
-        <ok to="wait_merge"/>
-        <error to="Kill"/>
-    </action>
-
-    <join name="wait_merge" to="End"/>
-
-    <end name="End"/>
-</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json
@ -1,38 +0,0 @@
-[
-	{
-		"paramName": "issm",
-		"paramLongName": "isSparkSessionManaged",
-		"paramDescription": "when true will stop SparkSession after job execution",
-		"paramRequired": false
-	},
-	{
-		"paramName": "bin",
-		"paramLongName": "betaInputPath",
-		"paramDescription": "the beta graph root path",
-		"paramRequired": true
-	},
-	{
-		"paramName": "pin",
-		"paramLongName": "prodInputPath",
-		"paramDescription": "the production graph root path",
-		"paramRequired": true
-	},
-	{
-		"paramName": "out",
-		"paramLongName": "outputPath",
-		"paramDescription": "the output merged graph root path",
-		"paramRequired": true
-	},
-	{
-		"paramName": "class",
-		"paramLongName": "graphTableClassName",
-		"paramDescription": "class name moelling the graph table",
-		"paramRequired": true
-	},
-	{
-		"paramName": "pr",
-		"paramLongName": "priority",
-		"paramDescription": "decides from which infrastructure the content must win in case of ID clash",
-		"paramRequired": false
-	}
-]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json
@ -1,4 +0,0 @@
-[
-  {"paramName":"mt",  "paramLongName":"master",             "paramDescription": "should be local or yarn",                                  "paramRequired": true},
-  {"paramName":"w",   "paramLongName":"workingPath",         "paramDescription": "the path of the sequencial file to read",                  "paramRequired": true}
-]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml
@ -1,68 +0,0 @@
-<configuration>
-
-    <!-- OCEAN  -->
-    <!--
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>hive_metastore_uris</name>
-        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
-    </property>
-    <property>
-        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
-    </property>
-    -->
-
-    <!-- GARR  -->
-
-    <property>
-        <name>jobTracker</name>
-        <value>yarn</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
-    </property>
-    <property>
-        <name>hive_metastore_uris</name>
-        <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
-    </property>
-    <property>
-        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
-    </property>
-
-
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
-    </property>
-
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-    <property>
-        <name>spark2EventLogDir</name>
-        <value>/user/spark/spark2ApplicationHistory</value>
-    </property>
-    <property>
-        <name>spark2ExtraListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
-    </property>
-    <property>
-        <name>spark2SqlQueryExecutionListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
-    </property>
-</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml
@ -1,97 +0,0 @@
-<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>workingPath</name>
-            <description>the Working Path</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="GenerateUpdates"/>
-
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-
-    <action name="GenerateBaselineDataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Baselnie DataSet</name>
-
-            <class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=1
-                --driver-memory=${sparkDriverMemory}
-                --executor-cores=${sparkExecutorCores}
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="GenerateUpdates">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Baselnie DataSet</name>
-
-            <class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=1
-                --driver-memory=${sparkDriverMemory}
-                --executor-cores=${sparkExecutorCores}
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-
-    <action name="CreateEBIDataSet">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create EBI DataSet</name>
-
-            <class>eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=1000
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@ -5,7 +5,8 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.mockito.Mockito.lenient;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.Mockito.when;

 import java.io.IOException;
 import java.util.List;
@ -19,9 +20,7 @@ import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;

-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
+import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
@ -32,25 +31,24 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.Software;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@ExtendWith(MockitoExtension.class)
 public class MappersTest {

-	@Mock
-	private ISLookUpService isLookUpService;
-
 	@Mock
 	private VocabularyGroup vocs;

 	@BeforeEach
 	public void setUp() throws Exception {
-		lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
-		lenient()
-			.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
-			.thenReturn(synonyms());
+		when(vocs.getTermAsQualifier(anyString(), anyString()))
+			.thenAnswer(
+				invocation -> OafMapperUtils
+					.qualifier(
+						invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
+						invocation.getArgument(0)));
+
+		when(vocs.termExists(anyString(), anyString())).thenReturn(true);

-		vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
 	}

 	@Test
@ -70,14 +68,9 @@ public class MappersTest {
 		final Relation r2 = (Relation) list.get(2);

 		assertValidId(p.getId());
-
-		assertTrue(p.getOriginalId().size() == 1);
-		assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));
-
 		assertValidId(p.getCollectedfrom().get(0).getKey());
 		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
 		assertFalse(p.getDataInfo().getInvisible());
-		assertTrue(p.getSource().size() == 1);

 		assertTrue(p.getAuthor().size() > 0);
 		final Optional<Author> author = p
@ -86,7 +79,6 @@ public class MappersTest {
 			.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
 			.findFirst();
 		assertTrue(author.isPresent());
-
 		final StructuredProperty pid = author
 			.get()
 			.getPid()
@ -177,8 +169,6 @@ public class MappersTest {
 		final Relation r2 = (Relation) list.get(2);

 		assertValidId(d.getId());
-		assertTrue(d.getOriginalId().size() == 1);
-		assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
 		assertValidId(d.getCollectedfrom().get(0).getKey());
 		assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
 		assertTrue(d.getAuthor().size() > 0);
@ -265,32 +255,10 @@ public class MappersTest {
 		assertTrue(s.getInstance().size() > 0);
 	}

-	// @Test
-	void testDataset_2() throws IOException {
-		final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml"));
-
-		final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
-
-		System.out.println("***************");
-		System.out.println(new ObjectMapper().writeValueAsString(list));
-		System.out.println("***************");
-	}
-
 	private void assertValidId(final String id) {
 		assertEquals(49, id.length());
 		assertEquals('|', id.charAt(2));
 		assertEquals(':', id.charAt(15));
 		assertEquals(':', id.charAt(16));
 	}
-
-	private List<String> vocs() throws IOException {
-		return IOUtils
-			.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
-	}
-
-	private List<String> synonyms() throws IOException {
-		return IOUtils
-			.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
-	}
-
 }
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBI.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBI.scala
@ -1,20 +0,0 @@
-package eu.dnetlib.dhp.sx.ebi
-
-import org.junit.jupiter.api.Test
-
-class TestEBI {
-
-
-
-//  @Test
-  def testEBIData() = {
-    SparkAddLinkUpdates.main("-mt local[*] -w /home/sandro/Downloads".split(" "))
-
-
-
-
-
-
-  }
-
-}
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml
@ -34,8 +34,6 @@
    <dc:relation>info:eu-repo/semantics/altIdentifier/eissn/2367-8194</dc:relation>
    <dc:relation>info:eu-repo/grantAgreement/EC/FP7/226852</dc:relation>
    <dc:source>One Ecosystem 2: e13718</dc:source>
-    <dc:source>One Ecosystem 2: e13718</dc:source>
-    <dc:source>One Ecosystem 2: e13718</dc:source>
    <dc:subject>Ecosystem Services hotspots</dc:subject>
    <dc:subject>Natura 2000</dc:subject>
    <dc:subject>Quiet Protected Areas</dc:subject>
@ -49,8 +47,7 @@
    <dc:subject>regulating services</dc:subject>
    <dc:subject>supporting services</dc:subject>
    <dc:type>Research Article</dc:type>
-    <!--<dr:CobjCategory type="publication">0001</dr:CobjCategory>-->
-    <dr:CobjCategory>0001</dr:CobjCategory>
+    <dr:CobjCategory type="publication">0001</dr:CobjCategory>
    <oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
    <oaf:projectid>corda_______::226852</oaf:projectid>
    <oaf:accessrights>OPEN</oaf:accessrights>
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml
@ -82,8 +82,7 @@
 <p>All files are in MATLAB .mat format.</p></description>
      </descriptions>
    </resource>
-    <!--<dr:CobjCategory type="dataset">0021</dr:CobjCategory>-->
-    <dr:CobjCategory>0021</dr:CobjCategory>
+    <dr:CobjCategory type="dataset">0021</dr:CobjCategory>
    <oaf:dateAccepted>2019-01-01</oaf:dateAccepted>
    <oaf:accessrights>OPEN</oaf:accessrights>
    <oaf:language>und</oaf:language>
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset_2.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset_2.xml
@ -1,75 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<oai:record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
-  xmlns:dri="http://www.driver-repository.eu/namespace/dri"
-  xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
-  <oai:header>
-    <dri:objIdentifier>opentrials__::0000bf8e63d3d7e6b88421eabafae3f6</dri:objIdentifier>
-    <dri:recordIdentifier>feabb67c-1fd1-423b-aec6-606d04ce53c6</dri:recordIdentifier>
-    <dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
-    <oaf:datasourceprefix>opentrials__</oaf:datasourceprefix>
-    <dr:dateOfTransformation>2019-04-17T16:04:20.586Z</dr:dateOfTransformation>
-  </oai:header>
-  <oai:metadata>
-    <resource xmlns="http://datacite.org/schema/kernel-3"
-      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
-      <identifier identifierType="URL">https://clinicaltrials.gov/ct2/show/NCT02321059</identifier>
-      <alternateIdentifiers>
-        <alternateIdentifier alternateIdentifierType="URL">http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059</alternateIdentifier>
-        <alternateIdentifier alternateIdentifierType="nct">NCT02321059</alternateIdentifier>
-      </alternateIdentifiers>
-      <creators>
-        <creator>
-          <creatorName>Jensen, Kristian K</creatorName>
-        </creator>
-      </creators>
-      <titles>
-        <title>Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia</title>
-      </titles>
-      <publisher>nct</publisher>
-      <geoLocations>
-        <geoLocationPlace>Denmark</geoLocationPlace>
-      </geoLocations>
-      <resourceType resourceTypeGeneral="clinicalTrial">0037</resourceType>
-      <descriptions>
-        <description descriptionType="Abstract">Patients with an incisional hernia in the midline and controls with an intact abdominal wall are examined twice with one week apart, in order to establish the test-retest reliability and internal and external validity of the Goodstrength trunk dynamometer.</description>
-      </descriptions>
-    </resource>
-    <oaf:accessrights>OPEN</oaf:accessrights>
-    <dr:CobjCategory type="dataset">0037</dr:CobjCategory>
-    <oaf:dateAccepted>2014-11-11</oaf:dateAccepted>
-    <oaf:hostedBy id="openaire____::opentrials" name="OpenTrials"/>
-    <oaf:collectedFrom id="openaire____::opentrials" name="OpenTrials"/>
-    <oaf:about>
-      <oaf:datainfo>
-        <oaf:inferred>false</oaf:inferred>
-        <oaf:deletedbyinference>false</oaf:deletedbyinference>
-        <oaf:trust>0.9</oaf:trust>
-        <oaf:inferenceprovenance/>
-        <oaf:provenanceaction
-          classid="sysimport:crosswalk:datasetarchive"
-          classname="sysimport:crosswalk:datasetarchive"
-          schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
-      </oaf:datainfo>
-    </oaf:about>
-  </oai:metadata>
-  <about xmlns:dc="http://purl.org/dc/elements/1.1/"
-    xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-    <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
-      <originDescription altered="true" harvestDate="2019-03-27T15:15:22.22Z">
-        <baseURL>file:///var/lib/dnet/data/opentrials/opentrials.csv</baseURL>
-        <identifier/>
-        <datestamp/>
-        <metadataNamespace/>
-      </originDescription>
-    </provenance>
-    <oaf:datainfo>
-      <oaf:inferred>false</oaf:inferred>
-      <oaf:deletedbyinference>false</oaf:deletedbyinference>
-      <oaf:trust>0.9</oaf:trust>
-      <oaf:inferenceprovenance/>
-      <oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
-        classname="sysimport:crosswalk:datasetarchive"
-        schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
-    </oaf:datainfo>
-  </about>
-</oai:record>
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml
@ -52,8 +52,7 @@
          subjectScheme="EDAM Ontology" valueURI="http://edamontology.org/topic_3534">Protein binding sites</datacite:subject>
      </datacite:subjects>
    </datacite:resource>
-    <!--<dr:CobjCategory type="software">0029</dr:CobjCategory>-->
-    <dr:CobjCategory>0029</dr:CobjCategory>
+    <dr:CobjCategory type="software">0029</dr:CobjCategory>
    <oaf:hostedBy id="rest________::bioTools" name="bio.tools"/>
    <oaf:collectedFrom id="rest________::bioTools" name="bio.tools"/>
    <oaf:dateAccepted>2018-06-06</oaf:dateAccepted>
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel1.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel1.json
@ -1,55 +0,0 @@
-{
-  "Category": [
-    {
-      "Section": [
-        {
-          "Linklist": {
-            "Link": [
-              {
-                "LinkProvider": {
-                  "Name": "Europe PMC"
-                },
-                "Target": {
-                  "Publisher": {
-                    "Name": "Altmetric"
-                  },
-                  "ImageURL": "https://api.altmetric.com/v1/donut/58578459_64.png",
-                  "Identifier": {
-                    "ID": "https://www.altmetric.com/details/58578459",
-                    "IDScheme": "URL",
-                    "IDURL": "https://www.altmetric.com/details/58578459"
-                  },
-                  "Type": {
-                    "Name": "dataset"
-                  },
-                  "Title": "Optical clumped isotope thermometry of carbon dioxide"
-                },
-                "Source": {
-                  "Identifier": {
-                    "ID": "30886173",
-                    "IDScheme": "PMID"
-                  },
-                  "Type": {
-                    "Name": "literature"
-                  }
-                },
-                "PublicationDate": "06-04-2019",
-                "RelationshipType": {
-                  "Name": "IsReferencedBy"
-                },
-                "ObtainedBy": "ext_links"
-              }
-            ]
-          },
-          "ObtainedBy": "ext_links",
-          "SectionLinkCount": 1,
-          "Tags": [
-            "altmetrics"
-          ]
-        }
-      ],
-      "CategoryLinkCount": 1,
-      "Name": "Altmetric"
-    }
-  ]
-}
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel_multiple.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel_multiple.json
@ -1,191 +0,0 @@
-{
-  "version": "6.3",
-  "hitCount": 4,
-  "request": {
-    "id": "28818901",
-    "source": "MED"
-  },
-  "dataLinkList": {
-    "Category": [
-      {
-        "Name": "Nucleotide Sequences",
-        "CategoryLinkCount": 3,
-        "Section": [
-          {
-            "ObtainedBy": "tm_accession",
-            "Tags": [
-              "supporting_data"
-            ],
-            "SectionLinkCount": 1,
-            "Linklist": {
-              "Link": [
-                {
-                  "ObtainedBy": "tm_accession",
-                  "PublicationDate": "27-02-2020",
-                  "LinkProvider": {
-                    "Name": "Europe PMC"
-                  },
-                  "RelationshipType": {
-                    "Name": "References"
-                  },
-                  "Source": {
-                    "Type": {
-                      "Name": "literature"
-                    },
-                    "Identifier": {
-                      "ID": "28818901",
-                      "IDScheme": "MED"
-                    }
-                  },
-                  "Target": {
-                    "Type": {
-                      "Name": "dataset"
-                    },
-                    "Identifier": {
-                      "ID": "AP008937",
-                      "IDScheme": "ENA",
-                      "IDURL": "http://identifiers.org/ena.embl/AP008937"
-                    },
-                    "Title": "AP008937",
-                    "Publisher": {
-                      "Name": "Europe PMC"
-                    }
-                  },
-                  "Frequency": 1
-                }
-              ]
-            }
-          },
-          {
-            "ObtainedBy": "submission",
-            "Tags": [
-              "related_data"
-            ],
-            "SectionLinkCount": 2,
-            "CollectionURL": "http://www.ebi.ac.uk/ena/data/search?query=28818901",
-            "Linklist": {
-              "Link": [
-                {
-                  "ObtainedBy": "submission",
-                  "PublicationDate": "25-06-2018",
-                  "LinkProvider": {
-                    "Name": "Europe PMC"
-                  },
-                  "RelationshipType": {
-                    "Name": "IsReferencedBy"
-                  },
-                  "Source": {
-                    "Type": {
-                      "Name": "literature"
-                    },
-                    "Identifier": {
-                      "ID": "28818901",
-                      "IDScheme": "PMID"
-                    }
-                  },
-                  "Target": {
-                    "Type": {
-                      "Name": "dataset"
-                    },
-                    "Identifier": {
-                      "ID": "NIWV01000000",
-                      "IDScheme": "ENA",
-                      "IDURL": "http://www.ebi.ac.uk/ena/data/view/NIWV01000000"
-                    },
-                    "Title": "Nucleotide sequences",
-                    "Publisher": {
-                      "Name": "ENA"
-                    }
-                  }
-                },
-                {
-                  "ObtainedBy": "submission",
-                  "PublicationDate": "25-06-2018",
-                  "LinkProvider": {
-                    "Name": "Europe PMC"
-                  },
-                  "RelationshipType": {
-                    "Name": "IsReferencedBy"
-                  },
-                  "Source": {
-                    "Type": {
-                      "Name": "literature"
-                    },
-                    "Identifier": {
-                      "ID": "28818901",
-                      "IDScheme": "PMID"
-                    }
-                  },
-                  "Target": {
-                    "Type": {
-                      "Name": "dataset"
-                    },
-                    "Identifier": {
-                      "ID": "PRJNA390617",
-                      "IDScheme": "ENA",
-                      "IDURL": "http://www.ebi.ac.uk/ena/data/view/PRJNA390617"
-                    },
-                    "Title": "Lactobacillus fermentum strain:BFE 6620",
-                    "Publisher": {
-                      "Name": "ENA"
-                    }
-                  }
-                }
-              ]
-            }
-          }
-        ]
-      },
-      {
-        "Name": "BioStudies: supplemental material and supporting data",
-        "CategoryLinkCount": 1,
-        "Section": [
-          {
-            "ObtainedBy": "ext_links",
-            "Tags": [
-              "supporting_data"
-            ],
-            "SectionLinkCount": 1,
-            "Linklist": {
-              "Link": [
-                {
-                  "ObtainedBy": "ext_links",
-                  "PublicationDate": "24-07-2018",
-                  "LinkProvider": {
-                    "Name": "Europe PMC"
-                  },
-                  "RelationshipType": {
-                    "Name": "IsReferencedBy"
-                  },
-                  "Source": {
-                    "Type": {
-                      "Name": "literature"
-                    },
-                    "Identifier": {
-                      "ID": "28818901",
-                      "IDScheme": "PMID"
-                    }
-                  },
-                  "Target": {
-                    "Type": {
-                      "Name": "dataset"
-                    },
-                    "Identifier": {
-                      "ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true",
-                      "IDScheme": "URL",
-                      "IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true"
-                    },
-                    "Title": "Draft Genome Sequence of Lactobacillus fermentum BFE 6620, a Potential Starter Culture for African Vegetable Foods, Isolated from Fermented Cassava.",
-                    "Publisher": {
-                      "Name": "BioStudies: supplemental material and supporting data"
-                    }
-                  }
-                }
-              ]
-            }
-          }
-        ]
-      }
-    ]
-  }
-}
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala
@ -5,12 +5,11 @@ import java.time.format.DateTimeFormatter

 import eu.dnetlib.dhp.common.PacePerson
 import eu.dnetlib.dhp.schema.action.AtomicAction
-import eu.dnetlib.dhp.schema.oaf.{Author,  Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
+import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty}
 import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.commons.lang3.StringUtils
 import org.codehaus.jackson.map.ObjectMapper
-import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._

 import scala.collection.JavaConverters._

@ -100,20 +99,6 @@ object DLIToOAF {
  )


-  def fixInstance(r:Publication) :Publication = {
-    val collectedFrom = r.getCollectedfrom.asScala.head
-    r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
-    r
-  }
-
-
-  def fixInstanceDataset(r:Dataset) :Dataset = {
-    val collectedFrom = r.getCollectedfrom.asScala.head
-    r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
-    r
-  }
-
-
  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

@ -427,6 +412,46 @@ object DLIToOAF {
  }


+  def generateKeyValue(key: String, value: String): KeyValue = { // key/value pair carrying a generated DataInfo
+    val pair = new KeyValue
+    pair.setKey(key)
+    pair.setValue(value)
+    pair.setDataInfo(generateDataInfo("0.9")) // trust is passed explicitly as "0.9"
+    pair
+  }


+  def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = { // DataInfo with the given trust/visibility; parameter spelling kept for named-argument callers
+    val di = new DataInfo
+    di.setDeletedbyinference(false)
+    di.setInferred(false)
+    di.setInvisible(invisibile) // honour the caller's flag: it used to be ignored and always forced to false
+    di.setTrust(trust)
+    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) // provenance qualifier is fixed here
+    di
+  }
+
+  // Shorthand overload: the class id doubles as the class name and the scheme id as the scheme name.
+  def createQualifier(cls: String, sch: String): Qualifier =
+    createQualifier(cls, cls, sch, sch)
+
+  // Builds a Qualifier from explicit class and scheme identifiers and names.
+  def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
+    val qualifier = new Qualifier
+    qualifier.setSchemeid(schemeId)
+    qualifier.setSchemename(schemeName)
+    qualifier.setClassid(classId)
+    qualifier.setClassname(className)
+    qualifier
+  }
+
+
+  // Wraps `value` in a new Field; only the value is set on it here.
+  def asField[T](value: T): Field[T] = {
+    val field = new Field[T]
+    field.setValue(value)
+    field
+  }
+
+
 }
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala
@ -1,7 +1,7 @@
 package eu.dnetlib.dhp.`export`

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Instance, Publication, Relation, Dataset => OafDataset}
+import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
 import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.io.Text
@ -166,13 +166,10 @@ object SparkExportContentForOpenAire {
    }).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS")


-
-    spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS_fixed")
-    spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS_fixed")
-
    val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet)
-    val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet)
-    val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet)
+    val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet)
+    val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet)
+

    fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
  }
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml
@ -16,15 +16,15 @@
        <value>spark2</value>
    </property>
    <property>
-        <name>hive_metastore_uris</name>
+        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
-        <name>hive_jdbc_url</name>
+        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
 	<property>
 		<name>oozie.wf.workflow.notification.url</name>
 		<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
 	</property>
-</configuration>
+</configuration>
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
@ -1,18 +0,0 @@
-export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
-export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
-if ! [ -L $link_folder ]
-then
-    rm -Rf "$link_folder"
-    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
-fi
-
-echo "Getting file from " $3
-hdfs dfs -copyToLocal $3
-
-echo "Running impala shell make the new database visible"
-impala-shell -q "INVALIDATE METADATA;"
-
-echo "Running impala shell to compute new table stats"
-impala-shell -d $1 -f $2
-echo "Impala shell finished"
-rm $2
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql
@ -1,8 +1,11 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- Stats database creation
--------------------------------------------------------------
--------------------------------------------------------------
+-- DROP database if EXISTS ${hive_db_name} cascade;
+-- CREATE database ${hive_db_name};
+-- 
+-- CREATE TABLE ${hive_db_name}.Persons ( 
+-- PersonID int, 
+-- LastName varchar(255));
+-- 
+-- INSERT INTO ${hive_db_name}.Persons VALUES (1, "test_db_spyros_rec_111"); 

 DROP database IF EXISTS ${stats_db_name} CASCADE;
 CREATE database ${stats_db_name};
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
@ -1,21 +0,0 @@
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Tables/views from external tables/views (Fundref, Country, CountyGDP, roarmap, rndexpediture)
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-CREATE OR REPLACE VIEW  ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref;
-CREATE OR REPLACE VIEW  ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country;
-CREATE OR REPLACE VIEW  ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp;
-CREATE OR REPLACE VIEW  ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap;
-CREATE OR REPLACE VIEW  ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture;
-CREATE OR REPLACE VIEW  ${stats_db_name}.context AS SELECT * FROM ${external_stats_db_name}.context;
-CREATE OR REPLACE VIEW  ${stats_db_name}.category AS SELECT * FROM ${external_stats_db_name}.category;
-CREATE OR REPLACE VIEW  ${stats_db_name}.concept AS SELECT * FROM ${external_stats_db_name}.concept;
-
-
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Creation date of the database
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-create table ${stats_db_name}.creation_date as select date_format(current_date(), 'dd-MM-yyyy') as date;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10_1.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10_1.sql
@ -0,0 +1,7 @@
+----------------------------------------------------------------
+----------------------------------------------------------------
+-- Organization table/view and Organization related tables/views
+----------------------------------------------------------------
+----------------------------------------------------------------
+DROP TABLE IF EXISTS ${stats_db_name}.organization;
+CREATE TABLE ${stats_db_name}.organization AS SELECT substr(o.id, 4) as id, o.legalname.value as name, o.country.classid as country from ${openaire_db_name}.organization o WHERE o.datainfo.deletedbyinference=false;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10_2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10_2.sql
@ -0,0 +1 @@
+CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10_3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10_3.sql
@ -0,0 +1 @@
+CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id FROM ${stats_db_name}.project_organizations;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
@ -1,44 +1,10 @@
----------------------------------------------------------------
----------------------------------------------------------------
-- Post processing - Updates on main tables
----------------------------------------------------------------
----------------------------------------------------------------
-
--Datasource temporary table updates
-UPDATE ${stats_db_name}.datasource_tmp SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd WHERE d.id=rd.datasource);
-
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
-UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication');
-
-DROP TABLE IF EXISTS ${stats_db_name}.project;
-CREATE TABLE ${stats_db_name}.project stored as parquet as
-SELECT p.id , p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration, 
-CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs, 
-CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs, 
-CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub, 
-CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
-p.callidentifier, p.code
-FROM ${stats_db_name}.project_tmp p 
-LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
-        FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id 
-        WHERE r.type='publication' 
-        GROUP BY pr.id) AS prr1 on prr1.id = p.id
-LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate)) ) AS daysForlastPub , count(distinct r.id) AS dp
-        FROM ${stats_db_name}.project_tmp pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r 
-        WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 
-        GROUP BY pp.id) AS prr2
-        ON prr2.id = p.id;
-        
-- Publication temporary table updates
-UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
-
-- Dataset temporary table updates
-UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
-
-- Software temporary table updates
-UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
-
-- Oherresearchproduct temporary table updates
-UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
-
-CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend AS daysfromend FROM  ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id AND result.type='publication' AND project.id=result_projects.project;
+------------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------------
+-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
+------------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------------
+CREATE OR REPLACE VIEW  ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref;
+CREATE OR REPLACE VIEW  ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country;
+CREATE OR REPLACE VIEW  ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp;
+CREATE OR REPLACE VIEW  ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap;
+CREATE OR REPLACE VIEW  ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql
@ -1,38 +0,0 @@
------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------
-
-DROP TABLE IF EXISTS ${stats_db_name}.datasource;
-CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * FROM ${stats_db_name}.datasource_tmp;
-
-DROP TABLE IF EXISTS  ${stats_db_name}.publication;
-CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * FROM ${stats_db_name}.publication_tmp;
-
-DROP TABLE IF EXISTS ${stats_db_name}.dataset;
-CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * FROM ${stats_db_name}.dataset_tmp;
-
-DROP TABLE IF EXISTS ${stats_db_name}.software;
-CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * FROM ${stats_db_name}.software_tmp;
-
-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct;
-CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * FROM ${stats_db_name}.otherresearchproduct_tmp;
-
-DROP TABLE ${stats_db_name}.project_tmp;
-DROP TABLE ${stats_db_name}.datasource_tmp;
-DROP TABLE ${stats_db_name}.publication_tmp;
-DROP TABLE ${stats_db_name}.dataset_tmp;
-DROP TABLE ${stats_db_name}.software_tmp;
-DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
-
----------------------------------------------
-- Re-creating views from final parquet tables
---------------------------------------------
-
-- Result
-CREATE OR REPLACE VIEW ${stats_db_name}.result AS SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct;
-
-
-------------------------------------------------------------------------------
-- To see with Antonis if the following is needed and where it should be placed
-------------------------------------------------------------------------------
-CREATE TABLE ${stats_db_name}.numbers_country AS SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications FROM ${stats_db_name}.result r, ${stats_db_name}.result_datasources rd, ${stats_db_name}.datasource d, ${stats_db_name}.datasource_organizations dor, ${stats_db_name}.organization org WHERE r.id=rd.id AND rd.datasource=d.id AND d.id=dor.id AND dor.organization=org.id AND r.type='publication' AND r.bestlicence='Open Access' GROUP BY org.country;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_1.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_1.sql
@ -0,0 +1,6 @@
+----------------------------
+-- Post processing - Updates
+----------------------------
+
+--Datasource temporary table updates
+UPDATE ${stats_db_name}.datasource_tmp set harvested ='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd where d.id=rd.datasource);
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_2.sql
@ -0,0 +1,2 @@
+-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
+UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication');
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_3.sql
@ -0,0 +1,20 @@
+DROP TABLE IF EXISTS ${stats_db_name}.project;
+
+CREATE TABLE ${stats_db_name}.project stored as parquet as
+SELECT p.id , p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration, 
+CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END as haspubs, 
+CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END as numpubs, 
+CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END as daysforlastpub, 
+CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END as delayedpubs,
+p.callidentifier, p.code
+FROM ${stats_db_name}.project_tmp p 
+LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
+        FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id 
+        WHERE r.type='publication' 
+        GROUP BY pr.id) AS prr1 on prr1.id = p.id
+LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate)) ) as daysForlastPub , count(distinct r.id) as dp
+        FROM ${stats_db_name}.project_tmp pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r 
+        WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 
+        GROUP BY pp.id) AS prr2
+        on prr2.id = p.id;
+        
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_4.sql
@ -0,0 +1,2 @@
+-- Publication temporary table updates: mark as delayed the publications dated after the end date of a linked project (results are read from ${stats_db_name}, not a hard-coded database)
+UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_5.sql
@ -0,0 +1,2 @@
+-- Dataset temporary table updates: mark as delayed the datasets dated after the end date of a linked project (results are read from ${stats_db_name}, not a hard-coded database)
+UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_6.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_6.sql
@ -0,0 +1,2 @@
+-- Software temporary table updates
+UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_7.sql
@ -0,0 +1,2 @@
+-- Otherresearchproduct temporary table updates
+UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_8.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12_8.sql
@ -0,0 +1 @@
+CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend as daysfromend FROM  ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id and result.type='publication' and project.id=result_projects.project;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
@ -1,59 +1,26 @@
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Sources related tables/views
------------------------------------------------------
------------------------------------------------------
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources as 
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource 
-FROM (
-    SELECT  substr(p.id, 4) as id, substr(datasource, 4) as datasource 
-from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p 
-LEFT OUTER JOIN
-(
-    SELECT substr(d.id, 4) id 
-    from ${openaire_db_name}.datasource d 
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
+------------------------------------------------------------------------------------------------------
+-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
+------------------------------------------------------------------------------------------------------
+
+DROP TABLE IF EXISTS ${stats_db_name}.datasource;
+CREATE TABLE ${stats_db_name}.datasource stored as parquet as select * from ${stats_db_name}.datasource_tmp;
+
+DROP TABLE IF EXISTS  ${stats_db_name}.publication;
+CREATE TABLE ${stats_db_name}.publication stored as parquet as select * from ${stats_db_name}.publication_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.dataset;
+CREATE TABLE ${stats_db_name}.dataset stored as parquet as select * from ${stats_db_name}.dataset_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.software;
+CREATE TABLE ${stats_db_name}.software stored as parquet as select * from ${stats_db_name}.software_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct;
+CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as select * from ${stats_db_name}.otherresearchproduct_tmp;
+
+DROP TABLE ${stats_db_name}.project_tmp;
+DROP TABLE ${stats_db_name}.datasource_tmp;
+DROP TABLE ${stats_db_name}.publication_tmp;
+DROP TABLE ${stats_db_name}.dataset_tmp;
+DROP TABLE ${stats_db_name}.software_tmp;
+DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;

-CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as 
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource 
-FROM (
-    SELECT  substr(p.id, 4) as id, substr(datasource, 4) as datasource 
-from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p 
-LEFT OUTER JOIN
-(
-    SELECT substr(d.id, 4) id 
-    from ${openaire_db_name}.datasource d 
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
-    
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as 
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource 
-FROM (
-    SELECT  substr(p.id, 4) as id, substr(datasource, 4) as datasource 
-from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p 
-LEFT OUTER JOIN
-(
-    SELECT substr(d.id, 4) id 
-    from ${openaire_db_name}.datasource d 
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
-    
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as 
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource 
-FROM (
-    SELECT  substr(p.id, 4) as id, substr(datasource, 4) as datasource 
-from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p 
-LEFT OUTER JOIN
-(
-    SELECT substr(d.id, 4) id 
-    from ${openaire_db_name}.datasource d 
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
-    
-CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
-SELECT * FROM ${stats_db_name}.publication_sources
-UNION ALL
-SELECT * FROM ${stats_db_name}.dataset_sources
-UNION ALL
-SELECT * FROM ${stats_db_name}.software_sources
-UNION ALL
-SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
@ -1,49 +1,7 @@
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Licences related tables/views
------------------------------------------------------
------------------------------------------------------
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS
-SELECT substr(p.id, 4) as id, licenses.value as type 
-from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
+----------------------------------------------
+-- Re-creating views from final parquet tables
+----------------------------------------------

-CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS
-SELECT substr(p.id, 4) as id, licenses.value as type 
-from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
+-- Result
+CREATE OR REPLACE VIEW ${stats_db_name}.result as SELECT *, bestlicence as access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.otherresearchproduct;

-CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS
-SELECT substr(p.id, 4) as id, licenses.value as type 
-from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
-
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS
-SELECT substr(p.id, 4) as id, licenses.value as type 
-from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
-
-CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
-SELECT * FROM ${stats_db_name}.publication_licenses
-UNION ALL
-SELECT * FROM ${stats_db_name}.dataset_licenses
-UNION ALL
-SELECT * FROM ${stats_db_name}.software_licenses
-UNION ALL
-SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
-
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids AS 
-select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid 
-from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;
-
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources as 
-SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource 
-FROM (
-    SELECT  substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource 
-    from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o 
-    LEFT OUTER JOIN (
-        SELECT substr(d.id, 4) id 
-        from ${openaire_db_name}.datasource d 
-        WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
@ -1,36 +0,0 @@
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Refereed related tables/views
------------------------------------------------------
------------------------------------------------------
-
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
-
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
-
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
-
-CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
-
-CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
-select * from ${stats_db_name}.publication_refereed
-union all
-select * from ${stats_db_name}.dataset_refereed
-union all
-select * from ${stats_db_name}.software_refereed
-union all
-select * from ${stats_db_name}.otherresearchproduct_refereed;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql
@ -1,80 +0,0 @@
----------------------------------------------------
-- Shortcuts for various definitions in stats db ---
----------------------------------------------------
-
-- Peer reviewed:
-- Results that have been collected from Crossref
-create table ${stats_db_name}.result_peerreviewed as
-with peer_reviewed as (
-    select distinct r.id as id
-    from ${stats_db_name}.result r
-    join ${stats_db_name}.result_sources rs on rs.id=r.id
-    join ${stats_db_name}.datasource d on d.id=rs.datasource
-    where d.name='Crossref')
-select distinct peer_reviewed.id as id, true as peer_reviewed
-from peer_reviewed
-union all
-select distinct r.id as id, false as peer_reviewed
-from ${stats_db_name}.result r
-left outer join peer_reviewed pr on pr.id=r.id
-where pr.id is null;
-
-- Green OA:
-- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal.
-create table ${stats_db_name}.result_greenoa as
-with result_green as (
-    select distinct r.id as id
-    from ${stats_db_name}.result r
-    join ${stats_db_name}.result_datasources rd on rd.id=r.id
-    join ${stats_db_name}.datasource d on d.id=rd.datasource
-    left outer join (
-        select rd.id from ${stats_db_name}.result_datasources rd
-        join ${stats_db_name}.datasource d on rd.datasource=d.id
-        join ${stats_db_name}.datasource_sources sds on sds.id=d.id
-        join ${stats_db_name}.datasource sd on sd.id=sds.datasource
-        where sd.name='DOAJ-ARTICLES'
-    ) as doaj on doaj.id=r.id
-    where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null)
-select distinct result_green.id, true as green
-from result_green
-union all
-select distinct r.id as id, false as green
-from ${stats_db_name}.result r
-left outer join result_green rg on rg.id=r.id
-where rg.id  is null;
-
-- GOLD OA:
-- OA results that have been harvested from a DOAJ journal.
-create table ${stats_db_name}.result_gold as
-with result_gold as (
-    select distinct r.id as id
-    from ${stats_db_name}.result r
-    join ${stats_db_name}.result_datasources rd on rd.id=r.id
-    join ${stats_db_name}.datasource d on d.id=rd.datasource
-    join ${stats_db_name}.datasource_sources sds on sds.id=d.id
-    join ${stats_db_name}.datasource sd on sd.id=sds.datasource
-    where r.type='publication' and r.bestlicence='Open Access' and sd.name='DOAJ-Articles')
-select distinct result_gold.id, true as gold
-from result_gold
-union all
-select distinct r.id, false as gold
-from ${stats_db_name}.result r
-where r.id not in (select id from result_gold);
-
-- shortcut result-country through the organization affiliation
-create table ${stats_db_name}.result_affiliated_country as
-select r.id as id, o.country as country
-from ${stats_db_name}.result r
-join ${stats_db_name}.result_organization ro on ro.id=r.id
-join ${stats_db_name}.organization o on o.id=ro.organization
-where o.country is not null and o.country!='';
-
-- shortcut result-country through datasource of deposition
-create table ${stats_db_name}.result_deposited_country as
-select r.id as id, o.country as country
-from ${stats_db_name}.result r
-join ${stats_db_name}.result_datasources rd on rd.id=r.id
-join ${stats_db_name}.datasource d on d.id=rd.datasource
-join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
-join ${stats_db_name}.organization o on o.id=dor.organization
-where o.country is not null and o.country!='';
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql
@ -1,55 +0,0 @@
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
-- peer reviewed)
-drop table if exists ${stats_db_name}.result_tmp;
-CREATE TABLE ${stats_db_name}.result_tmp (
-    id STRING,
-    title STRING,
-    publisher STRING,
-    journal STRING,
-    `date` STRING,
-    `year` INT,
-    bestlicence STRING,
-    access_mode STRING,
-    embargo_end_date STRING,
-    delayed BOOLEAN,
-    authors INT,
-    source STRING,
-    abstract BOOLEAN,
-    type STRING ,
-    peer_reviewed BOOLEAN,
-    green BOOLEAN,
-    gold BOOLEAN)
-clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
-
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.publication r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
-
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.dataset r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
-
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.software r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
-
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.otherresearchproduct r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
-
-drop table if exists ${stats_db_name}.result;
-drop view if exists ${stats_db_name}.result;
-create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
-drop table ${stats_db_name}.result_tmp;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql
@ -1,163 +0,0 @@
------------------------------------------------------
------------------------------------------------------
-- Shadow schema table exchange
------------------------------------------------------
------------------------------------------------------
-
-- Dropping old views
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.country;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.countrygdp;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_citations;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_classifications;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_concepts;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_datasources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_languages;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_licenses;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_oids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_pids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_sources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_topics;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_languages;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_oids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_organizations;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_results;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.fundref;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.numbers_country;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_datasources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_projects;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.project;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_oids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_organizations;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_results;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_citations;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_classifications;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_concepts;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_datasources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_languages;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_licenses;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_oids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_pids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_sources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_topics;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_citations;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_classifications;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_concepts;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_datasources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_languages;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_licenses;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_oids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_organization;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_pids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_projects;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_sources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_topics;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.rndexpediture;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.roarmap;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_citations;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_classifications;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_concepts;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_datasources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_languages;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_licenses;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_oids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_pids;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_sources;
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_topics;
-
-
-- Creating the shadow database, in case it doesn't exist
-CREATE database IF NOT EXISTS ${stats_db_shadow_name};
-
-- Creating new views
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.country AS SELECT * FROM ${stats_db_name}.country;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization AS SELECT * FROM ${stats_db_name}.organization;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project AS SELECT * FROM ${stats_db_name}.project;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication AS SELECT * FROM ${stats_db_name}.publication;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result AS SELECT * FROM ${stats_db_name}.result;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software AS SELECT * FROM ${stats_db_name}.software;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql
@ -1,81 +0,0 @@
------------------------------------------------------
------------------------------------------------------
-- Impala table statistics - Needed to make the tables
-- visible for impala
------------------------------------------------------
------------------------------------------------------
-
-COMPUTE STATS country;
-COMPUTE STATS countrygdp;
-COMPUTE STATS dataset;
-COMPUTE STATS dataset_citations;
-COMPUTE STATS dataset_classifications;
-COMPUTE STATS dataset_concepts;
-COMPUTE STATS dataset_datasources;
-COMPUTE STATS dataset_languages;
-COMPUTE STATS dataset_oids;
-COMPUTE STATS dataset_pids;
-COMPUTE STATS dataset_sources;
-COMPUTE STATS dataset_topics;
-COMPUTE STATS datasource;
-COMPUTE STATS datasource_languages;
-COMPUTE STATS datasource_oids;
-COMPUTE STATS datasource_organizations;
-COMPUTE STATS datasource_results;
-COMPUTE STATS fundref;
-COMPUTE STATS numbers_country;
-COMPUTE STATS organization;
-COMPUTE STATS organization_datasources;
-COMPUTE STATS organization_projects;
-COMPUTE STATS otherresearchproduct;
-COMPUTE STATS otherresearchproduct_citations;
-COMPUTE STATS otherresearchproduct_classifications;
-COMPUTE STATS otherresearchproduct_concepts;
-COMPUTE STATS otherresearchproduct_datasources;
-COMPUTE STATS otherresearchproduct_languages;
-COMPUTE STATS otherresearchproduct_licenses;
-COMPUTE STATS otherresearchproduct_oids;
-COMPUTE STATS otherresearchproduct_pids;
-COMPUTE STATS otherresearchproduct_sources;
-COMPUTE STATS otherresearchproduct_topics;
-COMPUTE STATS project;
-COMPUTE STATS project_oids;
-COMPUTE STATS project_organizations;
-COMPUTE STATS project_results;
-COMPUTE STATS publication;
-COMPUTE STATS publication_citations;
-COMPUTE STATS publication_classifications;
-COMPUTE STATS publication_concepts;
-COMPUTE STATS publication_datasources;
-COMPUTE STATS publication_languages;
-COMPUTE STATS publication_licenses;
-COMPUTE STATS publication_oids;
-COMPUTE STATS publication_pids;
-COMPUTE STATS publication_sources;
-COMPUTE STATS publication_topics;
-COMPUTE STATS result;
-COMPUTE STATS result_citations;
-COMPUTE STATS result_classifications;
-COMPUTE STATS result_concepts;
-COMPUTE STATS result_datasources;
-COMPUTE STATS result_languages;
-COMPUTE STATS result_licenses;
-COMPUTE STATS result_oids;
-COMPUTE STATS result_organization;
-COMPUTE STATS result_pids;
-COMPUTE STATS result_projects;
-COMPUTE STATS result_sources;
-COMPUTE STATS result_topics;
-COMPUTE STATS rndexpediture;
-COMPUTE STATS roarmap;
-COMPUTE STATS software;
-COMPUTE STATS software_citations;
-COMPUTE STATS software_classifications;
-COMPUTE STATS software_concepts;
-COMPUTE STATS software_datasources;
-COMPUTE STATS software_languages;
-COMPUTE STATS software_licenses;
-COMPUTE STATS software_oids;
-COMPUTE STATS software_pids;
-COMPUTE STATS software_sources;
-COMPUTE STATS software_topics;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
@ -1,35 +0,0 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- Publication table/view and Publication related tables/views
--------------------------------------------------------------
--------------------------------------------------------------
-
-- Publication temporary table
-DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp;
-
-CREATE TABLE ${stats_db_name}.publication_tmp (id STRING,   title STRING,   publisher STRING,   journal STRING,   date STRING,   year STRING,   bestlicence STRING,   embargo_end_date STRING,   delayed BOOLEAN,   authors INT,   source STRING,   abstract BOOLEAN,   type STRING )  clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
-
-INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal , 
-p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
-p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source,
-case when size(p.description) > 0 then true else false end as abstract,
-'publication' as type
-from ${openaire_db_name}.publication p
-where p.datainfo.deletedbyinference=false;
-
-CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;
-
-CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context;
-
-CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT  substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
-
-CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p;
-
-CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids;
-
-CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid;
-
-CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject;
-
-- Publication_citations
-CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_1.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_1.sql
@ -0,0 +1,10 @@
+--------------------------------------------------------------
+--------------------------------------------------------------
+-- 2. Publication table/view and Publication related tables/views
+--------------------------------------------------------------
+--------------------------------------------------------------
+
+-- Publication temporary table
+DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp;
+
+CREATE TABLE ${stats_db_name}.publication_tmp (id STRING,   title STRING,   publisher STRING,   journal STRING,   date STRING,   year STRING,   bestlicence STRING,   embargo_end_date STRING,   delayed BOOLEAN,   authors INT,   source STRING,   abstract BOOLEAN,   type STRING )  clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_2.sql
@ -0,0 +1,19 @@
+-- The INSERT statement below fails to compile on CRN HUE Hive with the following exception:
+-- Error while compiling statement: FAILED: SemanticException [Error 10011]: Line 2:34 Invalid function 'date_format'
+-- The same statement runs OK on OCEAN HUE Hive.
+
+INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal , 
+p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
+p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source,
+case when size(p.description) > 0 then true else false end as abstract,
+'publication' as type
+from ${openaire_db_name}.publication p
+where p.datainfo.deletedbyinference=false;
+
+-- INSERT INTO ${hive_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal,
+-- p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
+-- p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source,
+-- case when size(p.description) > 0 then true else false end as abstract,
+-- 'publication' as type
+-- from openaire.publication p
+-- where p.datainfo.deletedbyinference=false;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_3.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_4.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_5.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT  substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_6.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_6.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.publication p;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_7.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids as ids;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_8.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_8.sql
@ -0,0 +1 @@
+create table ${stats_db_name}.publication_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.publication p lateral view explode(p.pid) pids as ppid;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_9.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2_9.sql
@ -0,0 +1 @@
+create table ${stats_db_name}.publication_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.publication p lateral view explode(p.subject) subjects as subject;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
@ -1,36 +1,2 @@
------------------------------------------------------
------------------------------------------------------
-- Dataset table/view and Dataset related tables/views
------------------------------------------------------
------------------------------------------------------
-
-- Dataset temporary table supporting updates
-DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp;
-CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN,   authors INT,   source STRING,   abstract BOOLEAN,   type STRING ) clustered by (id) into 100 buckets stored AS orc tblproperties('transactional'='true');
-
-INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, d.title[0].value AS title, d.publisher.value AS publisher, cast(null AS string) AS journal, 
-d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') AS year, d.bestaccessright.classname AS bestlicence,
-d.embargoenddate.value AS embargo_end_date, false AS delayed, size(d.author) AS authors , concat_ws('\u003B',d.source.value) AS source,
- CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
-'dataset' AS type
-FROM ${openaire_db_name}.dataset d
-WHERE d.datainfo.deletedbyinference=FALSE;
-
-- Dataset_citations
-CREATE TABLE ${stats_db_name}.dataset_citations AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.dataset d  LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
-
-CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;
-
-CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context;
-
-CREATE TABLE ${stats_db_name}.dataset_datasources AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT  substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource 
-FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN
-(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id;
-
-CREATE TABLE ${stats_db_name}.dataset_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p;
-
-CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids;
-
-CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid;
-
-CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject;
+-- 3. Publication_citations
+CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
@ -1,36 +0,0 @@
--------------------------------------------------------
--------------------------------------------------------
-- Software table/view and Software related tables/views
--------------------------------------------------------
--------------------------------------------------------
-
-- Software temporary table supporting updates
-DROP TABLE IF EXISTS ${stats_db_name}.software_tmp;
-CREATE TABLE ${stats_db_name}.software_tmp (id STRING,   title STRING,   publisher STRING,   journal STRING,   date STRING,   year STRING,   bestlicence STRING,   embargo_end_date STRING,   delayed BOOLEAN,   authors INT,   source STRING,   abstract BOOLEAN,   type STRING )  clustered by (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true');
-
-INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, s.title[0].value AS title, s.publisher.value AS publisher, CAST(NULL AS string) AS journal, 
-s.dateofacceptance.value AS DATE, date_format(s.dateofacceptance.value,'yyyy') AS YEAR, s.bestaccessright.classname AS bestlicence,
-s.embargoenddate.value AS embargo_end_date, FALSE AS delayed, SIZE(s.author) AS authors , concat_ws('\u003B',s.source.value) AS source,
- CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
-'software' as type
-from ${openaire_db_name}.software s
-where s.datainfo.deletedbyinference=false;
-
-- Software_citations
-CREATE TABLE ${stats_db_name}.software_citations AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.software s  LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
-
-CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;
-
-CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context;
-
-CREATE TABLE ${stats_db_name}.software_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM (SELECT  substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource 
-FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN
-(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id;
-
-CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p;
-
-CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids;
-
-CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid;
-
-CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_1.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_1.sql
@ -0,0 +1,9 @@
+------------------------------------------------------
+------------------------------------------------------
+-- 4. Dataset table/view and Dataset related tables/views
+------------------------------------------------------
+------------------------------------------------------
+
+-- Dataset temporary table supporting updates
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp;
+CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN,   authors INT,   source STRING,   abstract BOOLEAN,   type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_10.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_10.sql
@ -0,0 +1 @@
+create table ${stats_db_name}.dataset_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.dataset p lateral view explode(p.subject) subjects as subject;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_2.sql
@ -0,0 +1,7 @@
+INSERT INTO ${stats_db_name}.dataset_tmp select substr(d.id, 4) as id, d.title[0].value as title, d.publisher.value as publisher, cast(null as string) as journal, 
+d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') as year, d.bestaccessright.classname as bestlicence,
+d.embargoenddate.value as embargo_end_date, false as delayed, size(d.author) as authors , concat_ws('\u003B',d.source.value) as source,
+ case when size(d.description) > 0 then true else false end as abstract,
+'dataset' as type
+from ${openaire_db_name}.dataset d
+where d.datainfo.deletedbyinference=false;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_3.sql
@ -0,0 +1,2 @@
+-- Dataset_citations
+Create table ${stats_db_name}.dataset_citations as select substr(d.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") as result from ${openaire_db_name}.dataset d  lateral view explode(d.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_4.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_5.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_6.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_6.sql
@ -0,0 +1,3 @@
+CREATE TABLE ${stats_db_name}.dataset_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT  substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource 
+from ${openaire_db_name}.dataset p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN
+(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_7.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.dataset_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.dataset p;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_8.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_8.sql
@ -0,0 +1 @@
+CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids as ids;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_9.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4_9.sql
@ -0,0 +1 @@
+create table ${stats_db_name}.dataset_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.dataset p lateral view explode(p.pid) pids as ppid;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
@ -1,37 +0,0 @@
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Otherresearchproduct table/view and Otherresearchproduct related tables/views
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-
-- Otherresearchproduct temporary table supporting updates
-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp;
-CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp (   id STRING,   title STRING,   publisher STRING,   journal STRING,   date STRING,   year STRING,   bestlicence STRING,   embargo_end_date STRING,   delayed BOOLEAN,   authors INT,   source STRING,   abstract BOOLEAN,   type STRING )  CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true');
-
-INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, o.title[0].value AS title, o.publisher.value AS publisher, CAST(NULL AS string) AS journal, 
-o.dateofacceptance.value AS DATE, date_format(o.dateofacceptance.value,'yyyy') AS year, o.bestaccessright.classname AS bestlicence,
-o.embargoenddate.value as embargo_end_date, FALSE AS delayed, SIZE(o.author) AS authors , concat_ws('\u003B',o.source.value) AS source,
-CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
-'other' AS type 
-FROM ${openaire_db_name}.otherresearchproduct o
-WHERE o.datainfo.deletedbyinference=FALSE;
-
-- Otherresearchproduct_citations
-CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.otherresearchproduct o  LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
-
-CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;
-
-CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context;
-
-
-CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT  substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource 
-from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN
-(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
-
-CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p;
-
-CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids;
-
-CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid;
-
-CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5_1.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5_1.sql
@ -0,0 +1,9 @@
+--------------------------------------------------------
+--------------------------------------------------------
+-- 5. Software table/view and Software related tables/views
+--------------------------------------------------------
+--------------------------------------------------------
+
+-- Software temporary table supporting updates
+DROP TABLE IF EXISTS ${stats_db_name}.software_tmp;
+CREATE TABLE ${stats_db_name}.software_tmp (id STRING,   title STRING,   publisher STRING,   journal STRING,   date STRING,   year STRING,   bestlicence STRING,   embargo_end_date STRING,   delayed BOOLEAN,   authors INT,   source STRING,   abstract BOOLEAN,   type STRING )  clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5_10.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5_10.sql
@ -0,0 +1 @@
+create table ${stats_db_name}.software_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.software p lateral view explode(p.subject) subjects as subject;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
miconis	d47352cbc7	refactoring of the procedure for the id generation, minor changes and addition of a comparation on the original id and the origin datasource	2020-07-24 20:10:47 +02:00
miconis	b260fee787	implementation of the dedup_id generation using pids to make the graph more stable	2020-07-22 17:29:48 +02:00
				`@ -0,0 +1 @@`
				`CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations;`
				`@ -0,0 +1 @@`
				`CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;`
				`@ -0,0 +1 @@`
				`create table ${stats_db_name}.dataset_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.dataset p lateral view explode(p.subject) subjects as subject;`
				`@ -0,0 +1 @@`
				`CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;`
				`@ -0,0 +1 @@`
				`CREATE TABLE ${stats_db_name}.dataset_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.dataset p;`
				`@ -0,0 +1 @@`
				`CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids as ids;`
				`@ -0,0 +1 @@`
				`create table ${stats_db_name}.dataset_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.dataset p lateral view explode(p.pid) pids as ppid;`
				`@ -0,0 +1 @@`
				`create table ${stats_db_name}.software_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.software p lateral view explode(p.subject) subjects as subject;`