Compare commits

...

62 Commits

Author SHA1 Message Date
Michele Artini 8ba94833bd added an es prop 2020-07-29 14:16:08 +02:00
Claudio Atzori 6f11c0496e fixed typo in module name dhp-worfklow-profiles -> dhp-workflow-profiles 2020-07-28 15:01:58 +02:00
Claudio Atzori f680eb3e12 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2020-07-28 14:10:56 +02:00
Claudio Atzori 985b360c31 fixed typo in module name dhp-worfklow-profiles -> dhp-workflow-profiles 2020-07-28 14:10:52 +02:00
Claudio Atzori 7fc27bfdd1 Merge pull request 'islookup_timeout' (#30) from islookup_timeout into master
Thanks, Michele!
2020-07-28 13:53:12 +02:00
Michele Artini 3acd632123 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-07-28 12:02:30 +02:00
Michele Artini 35e6e9c064 tests 2020-07-28 12:02:15 +02:00
Claudio Atzori 2c4196ab22 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into islookup_timeout 2020-07-27 17:40:58 +02:00
Claudio Atzori ee832f358e Merge pull request 'stats_wf_extensions_and_corrections' (#28) from spyros/dnet-hadoop:stats_wf_extensions_and_corrections into master
Thank you Guys! The update workflow will be made available to the beta & production orchestration systems under the HDFS path

```/lib/dnet/oa/graph/stats/oozie_app```
2020-07-27 16:02:03 +02:00
Antonis Lempesis 4ac8ebe427 correctly calculating the project duration 2020-07-24 19:50:40 +03:00
Antonis Lempesis 18d9464b52 creating shadow db only if it not exists... 2020-07-24 19:50:40 +03:00
Antonis Lempesis e217d496ab added the dest db... 2020-07-24 19:50:40 +03:00
Antonis Lempesis b16bb68b9f added the target db name... 2020-07-24 19:50:40 +03:00
Antonis Lempesis 1ee7eeedf3 added the source db name... 2020-07-24 19:50:40 +03:00
Antonis Lempesis cecbbfa0fc added missing tables and views: contexts, creation_date, funder 2020-07-24 19:50:40 +03:00
Antonis Lempesis 25b7a615f5 moved datasource_sources table creating in the datasource section 2020-07-24 19:50:40 +03:00
Antonis Lempesis a8da4ab9c0 years in projects are now integers 2020-07-24 19:50:40 +03:00
Antonis Lempesis c9cfc165d9 not using impala since the resulting tables are not visible 2020-07-24 19:50:40 +03:00
Antonis Lempesis dd3d6a6e15 compute stats for the used and new impala tables 2020-07-24 19:50:40 +03:00
Antonis Lempesis e6f50de6ef Separated impala from hive steps 2020-07-24 19:50:40 +03:00
Antonis Lempesis de49173420 fixed a typo in queries 2020-07-24 19:50:40 +03:00
antleb 391cf80fb8 Added peer-reviewed, green, gold tables and fields in result. Added shortcuts from result-country 2020-07-24 19:50:40 +03:00
antleb 68389d0125 Corrected the script used by the last step of the wf 2020-07-24 19:50:40 +03:00
antleb ec52141f1a changed refereed type from value to clssname 2020-07-24 19:50:40 +03:00
Spyros Zoupanos 63cd797aba Comment out step 15 to make it work with the new schema of Claudio 2020-07-24 19:50:40 +03:00
Spyros Zoupanos 138c6ddffa Insert statement to datasource table that takes into account the piwik_id of the openAIRE graph 2020-07-24 19:50:40 +03:00
Spyros Zoupanos 3630794cef Fix to consider the relationships that have been 'virtually deleted' for project_results - defect #5607 2020-07-24 19:50:40 +03:00
Spyros Zoupanos 5546f29e63 Corrections on the shadow schema and the impala table stats calculation 2020-07-24 19:50:40 +03:00
Spyros Zoupanos adf8a025d2 Adding more relations (Sources, Licences, Additional) and shadow schema as provided and discussed with Antonis Lempesis 2020-07-24 19:50:40 +03:00
Spyros Zoupanos 657a40536b Corrections by Spyros: Scipt cleanup, corrections and re-arrangement 2020-07-24 19:50:40 +03:00
Giorgos Alexiou 477fa6234d Script re-organisation and adding table invalidations needed for impala 2020-07-24 19:50:40 +03:00
Claudio Atzori 56bbfdc65d introduced parameter 'numParitions', driving the hive DB table data partitioning. Currently specified only for table 'project' 2020-07-23 08:54:10 +02:00
Sandro La Bruzzo 9ab594ccf6 fixed test 2020-07-21 10:36:21 +02:00
Claudio Atzori ebf60020ac map results as OPRs in case of missing //CobjCategory/@type and the vocabulary dnet:result_typologies doesn't resolve the super type 2020-07-20 19:01:10 +02:00
Claudio Atzori 32f5e466e3 imports cleanup 2020-07-20 17:42:58 +02:00
Claudio Atzori 54ac583923 code formatting 2020-07-20 17:37:08 +02:00
Claudio Atzori 124e7ce19c in case of missing attribute //dr:CobjCategory/@type the resulttype is derived by looking up the vocabulary dnet:result_typologies with the 1st instance type available 2020-07-20 17:33:37 +02:00
Claudio Atzori 050dda223d Merge pull request 'removed duplicated fields' (#25) from unique_field_in_lists into master
Looks good as a temporary workaround. I agree the model could seamlessly make the distinct operation by using HashSets instead of Linked (or Array) Lists.

The task to update the model in such a way is added on #9#issuecomment-1583

Thanks!
2020-07-20 12:12:50 +02:00
Claudio Atzori e0c4cf6f7b added parameter to drive the graph merge strategy: priority (BETA|PROD) 2020-07-20 10:48:01 +02:00
Claudio Atzori 94ccdb4852 Merge branch 'master' into merge_graph 2020-07-20 10:14:55 +02:00
Claudio Atzori 0937c9998f Merge branch 'deduptesting' 2020-07-20 10:00:20 +02:00
Michele Artini 331a3cbdd0 fixed originalId 2020-07-20 09:50:29 +02:00
Michele Artini c59c5369b1 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-07-18 09:40:54 +02:00
Michele Artini 346a1d2b5a update eventId generator 2020-07-18 09:40:36 +02:00
Sandro La Bruzzo 9116d75b3e Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-07-17 18:01:30 +02:00
Miriam Baglioni 47c7122773 changed priority from beta to production 2020-07-17 12:56:35 +02:00
Michele Artini 442f30930c removed duplicated fields 2020-07-17 12:25:36 +02:00
Claudio Atzori 1781609508 code formatting 2020-07-16 19:06:56 +02:00
Claudio Atzori db8b90a156 renamed CORE -> BETA 2020-07-16 19:05:13 +02:00
Claudio Atzori 878f2b931c Merge branch 'master' into merge_graph 2020-07-16 16:34:24 +02:00
Sandro La Bruzzo c01efed79b Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-07-10 14:44:57 +02:00
Sandro La Bruzzo a7d3977481 added generation of EBI Dataset 2020-07-10 14:44:50 +02:00
Claudio Atzori 610d377d57 first implementation of the BETA & PROD graphs merge procedure 2020-07-08 16:54:26 +02:00
Sandro La Bruzzo 18b9330312 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-07-02 12:43:19 +02:00
Sandro La Bruzzo 07f0723fa7 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-07-02 12:37:49 +02:00
Sandro La Bruzzo 1d420eedb4 added generation of EBI Dataset 2020-07-02 12:37:43 +02:00
Sandro La Bruzzo dab783b173 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-06-29 09:05:00 +02:00
Claudio Atzori 74da8a08cf Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into islookup_timeout 2020-06-26 14:30:07 +02:00
Sandro La Bruzzo 96ce124b59 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-06-25 17:00:43 +02:00
Claudio Atzori 93052ae384 WIP: set the connect & request timeout for BindingProvider service implementation 2020-06-25 16:16:02 +02:00
Sandro La Bruzzo 96689a8994 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2020-06-24 14:06:50 +02:00
Sandro La Bruzzo 46631a4421 updated mapping scholexplorer to OAF 2020-06-24 14:06:38 +02:00
145 changed files with 3873 additions and 1399 deletions
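
A note on the PR #25 review comment above: the merged change filters duplicates at mapping time (see the `OafMapperUtils` diff further down), while the comment suggests letting the model enforce uniqueness by switching list fields to sets. A minimal, hypothetical sketch of that set-based alternative (not the actual model change, which is tracked in #9):

```java
// Hypothetical sketch of the HashSet-based model change suggested in the PR #25
// review comment: a LinkedHashSet keeps insertion order while dropping duplicates,
// so no explicit distinct step is needed. Field and class names are made up.
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashSet;

class SetBackedFieldsSketch {
    private final Collection<String> subjects = new LinkedHashSet<>(); // instead of ArrayList/LinkedList

    void addSubjects(Collection<String> values) {
        subjects.addAll(values); // duplicates are silently discarded
    }

    public static void main(String[] args) {
        SetBackedFieldsSketch r = new SetBackedFieldsSketch();
        r.addSubjects(Arrays.asList("climate", "climate", "oceanography"));
        System.out.println(r.subjects); // [climate, oceanography]
    }
}
```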

View File

@ -1,15 +1,22 @@
package eu.dnetlib.dhp.utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Map;
import javax.xml.ws.BindingProvider;
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class ISLookupClientFactory {
private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);
private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
private static int requestTimeout = 60000 * 10;
private static int connectTimeout = 60000 * 10;
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);
@ -21,6 +28,25 @@ public class ISLookupClientFactory {
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
jaxWsProxyFactory.setServiceClass(clazz);
jaxWsProxyFactory.setAddress(endpoint);
return (T) jaxWsProxyFactory.create();
final T service = (T) jaxWsProxyFactory.create();
if (service instanceof BindingProvider) {
log
.info(
"setting timeouts for {} to requestTimeout: {}, connectTimeout: {}",
BindingProvider.class.getName(), requestTimeout, connectTimeout);
Map<String, Object> requestContext = ((BindingProvider) service).getRequestContext();
requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout);
requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout);
requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout);
requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout);
requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout);
requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout);
}
return service;
}
}
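
The change above replaces commons-logging with slf4j and, whenever the CXF proxy implements `BindingProvider`, stores a 10-minute request and connect timeout in its request context. A usage sketch under those assumptions (the IS Lookup URL below is hypothetical):

```java
// Usage sketch for the patched factory. The endpoint URL is hypothetical; the
// property key read back is one of those set by the factory above.
import javax.xml.ws.BindingProvider;

import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class ISLookupTimeoutSketch {
    public static void main(String[] args) {
        final String isLookupUrl = "http://services.example.org/is/services/isLookUp"; // hypothetical
        final ISLookUpService lookup = ISLookupClientFactory.getLookUpService(isLookupUrl);

        // CXF JAX-WS proxies implement BindingProvider, so the timeouts apply to every remote call.
        if (lookup instanceof BindingProvider) {
            final Object timeout = ((BindingProvider) lookup).getRequestContext()
                .get("javax.xml.ws.client.receiveTimeout");
            System.out.println("receive timeout (ms): " + timeout); // 600000
        }
    }
}
```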

View File

@ -14,6 +14,37 @@
<description>This module contains common schema classes meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>

View File

@ -1,8 +1,6 @@
package eu.dnetlib.dhp.schema.common;
import java.security.Key;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Qualifier;

View File

@ -0,0 +1,90 @@
package eu.dnetlib.dhp.schema.scholexplorer
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty}
object OafUtils {
def generateKeyValue(key: String, value: String): KeyValue = {
val kv: KeyValue = new KeyValue()
kv.setKey(key)
kv.setValue(value)
kv.setDataInfo(generateDataInfo("0.9"))
kv
}
def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
di
}
def createQualifier(cls: String, sch: String): Qualifier = {
createQualifier(cls, cls, sch, sch)
}
def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
val q: Qualifier = new Qualifier
q.setClassid(classId)
q.setClassname(className)
q.setSchemeid(schemeId)
q.setSchemename(schemeName)
q
}
def asField[T](value: T): Field[T] = {
val tmp = new Field[T]
tmp.setValue(value)
tmp
}
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
sp.setValue(value)
sp
}
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
}
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))
sp.setValue(value)
sp
}
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
}
}

View File

@ -34,7 +34,10 @@ public class EventFactory {
final MappedFields map = createMapFromResult(updateInfo);
final String eventId = calculateEventId(
updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString());
updateInfo.getTopicPath(), updateInfo.getTargetDs().getOpenaireId(), updateInfo
.getTarget()
.getOpenaireId(),
updateInfo.getHighlightValueAsString());
res.setEventId(eventId);
res.setProducerId(PRODUCER_ID);
@ -93,11 +96,13 @@ public class EventFactory {
return map;
}
private static String calculateEventId(final String topic, final String publicationId, final String value) {
private static String calculateEventId(final String topic, final String dsId, final String publicationId,
final String value) {
return "event-"
+ DigestUtils.md5Hex(topic).substring(0, 6) + "-"
+ DigestUtils.md5Hex(publicationId).substring(0, 8) + "-"
+ DigestUtils.md5Hex(value).substring(0, 8);
+ DigestUtils.md5Hex(topic).substring(0, 4) + "-"
+ DigestUtils.md5Hex(dsId).substring(0, 4) + "-"
+ DigestUtils.md5Hex(publicationId).substring(0, 7) + "-"
+ DigestUtils.md5Hex(value).substring(0, 5);
}
private static long calculateExpiryDate(final long now) {
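
Besides wiring the target datasource into the call, the hunk above reshapes the event identifier: the datasource id is now hashed in, and the md5 prefixes shrink from 6-8-8 to 4-4-7-5 characters. A self-contained sketch of the new layout (inputs are made up):

```java
// Sketch of the new eventId layout: "event-" + md5 prefixes of topic (4 chars),
// datasource id (4), publication id (7) and highlight value (5). Inputs are made up.
import org.apache.commons.codec.digest.DigestUtils;

public class EventIdSketch {
    static String calculateEventId(String topic, String dsId, String publicationId, String value) {
        return "event-"
            + DigestUtils.md5Hex(topic).substring(0, 4) + "-"
            + DigestUtils.md5Hex(dsId).substring(0, 4) + "-"
            + DigestUtils.md5Hex(publicationId).substring(0, 7) + "-"
            + DigestUtils.md5Hex(value).substring(0, 5);
    }

    public static void main(String[] args) {
        System.out.println(
            calculateEventId("ENRICH/MISSING/PID", "10|ds::example", "50|pub::example", "doi:10.1234/example"));
        // prints something like event-ab12-cd34-ef56789-01234
    }
}
```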

View File

@ -48,12 +48,13 @@ public class IndexOnESJob {
final JavaRDD<String> inputRdd = ClusterUtils
.readPath(spark, eventsPath, Event.class)
// .limit(10000) // TODO REMOVE
.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
.javaRDD();
final Map<String, String> esCfg = new HashMap<>();
// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", "8");

View File

@ -64,182 +64,11 @@
</configuration>
</global>
<start to="join_entities_step0"/>
<start to="index_es"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="join_entities_step0">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep0</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep0Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities_step1"/>
<error to="Kill"/>
</action>
<action name="join_entities_step1">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep1</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep1Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities_step2"/>
<error to="Kill"/>
</action>
<action name="join_entities_step2">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep2</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep2Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities_step3"/>
<error to="Kill"/>
</action>
<action name="join_entities_step3">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep3</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep3Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities_step4"/>
<error to="Kill"/>
</action>
<action name="join_entities_step4">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep4</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep4Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="prepare_groups"/>
<error to="Kill"/>
</action>
<action name="prepare_groups">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareGroupsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareGroupsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="generate_events"/>
<error to="Kill"/>
</action>
<action name="generate_events">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEventsJob</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
</spark>
<ok to="index_es"/>
<error to="Kill"/>
</action>
<action name="index_es">
<spark xmlns="uri:oozie:spark-action:0.2">
@ -262,34 +91,10 @@
<arg>--index</arg><arg>${esIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
</spark>
<ok to="stats"/>
<error to="Kill"/>
</action>
<action name="stats">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateStatsJob</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateStatsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -9,6 +9,37 @@
<artifactId>dhp-graph-mapper</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
@ -61,6 +92,13 @@
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
</dependency>
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_2.11</artifactId>
<version>3.5.3</version>
</dependency>
</dependencies>

View File

@ -9,6 +9,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
@ -42,6 +43,12 @@ public class GraphHiveTableImporterJob {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(-1);
log.info("numPartitions: {}", numPartitions);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
@ -60,16 +67,21 @@ public class GraphHiveTableImporterJob {
conf.set("hive.metastore.uris", hiveMetastoreUris);
runWithSparkHiveSession(
conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz));
conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz, numPartitions));
}
// protected for testing
private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
Class<T> clazz) {
Class<T> clazz, int numPartitions) {
spark
.read()
.textFile(inputPath)
Dataset<String> dataset = spark.read().textFile(inputPath);
if (numPartitions > 0) {
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
dataset = dataset.repartition(numPartitions);
}
dataset
.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
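
The new `numPartitions` argument (default `-1`, meaning leave the data as read) simply repartitions the dataset before it is written into the Hive database, which bounds the number of files produced per table. The same pattern in isolation, as a minimal sketch with hypothetical paths:

```java
// Minimal sketch of the repartition-before-write pattern introduced above.
// A value of -1 keeps whatever partitioning the read produced; a positive value
// also caps the number of files written (roughly one per partition).
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class RepartitionSketch {
    static void importTable(SparkSession spark, String inputPath, String table, int numPartitions) {
        Dataset<String> dataset = spark.read().textFile(inputPath); // hypothetical JSON-lines input
        if (numPartitions > 0) {
            dataset = dataset.repartition(numPartitions);
        }
        dataset.write().mode(SaveMode.Overwrite).saveAsTable(table);
    }
}
```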

View File

@ -0,0 +1,162 @@
package eu.dnetlib.dhp.oa.graph.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
* Combines the content from two aggregator graph tables of the same type; entities (or relationships) with the same ids
* are picked preferring those from the BETA aggregator rather than from PROD. The identity of a relationship is defined
* by eu.dnetlib.dhp.schema.common.ModelSupport#idFn()
*/
public class MergeGraphSparkJob {
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String PRIORITY_DEFAULT = "BETA"; // BETA | PROD
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CleanGraphSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
String priority = Optional
.ofNullable(parser.get("priority"))
.orElse(PRIORITY_DEFAULT);
log.info("priority: {}", priority);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String betaInputPath = parser.get("betaInputPath");
log.info("betaInputPath: {}", betaInputPath);
String prodInputPath = parser.get("prodInputPath");
log.info("prodInputPath: {}", prodInputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
mergeGraphTable(spark, priority, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath);
});
}
private static <P extends Oaf, B extends Oaf> void mergeGraphTable(
SparkSession spark,
String priority,
String betaInputPath,
String prodInputPath,
Class<P> p_clazz,
Class<B> b_clazz,
String outputPath) {
Dataset<Tuple2<String, B>> beta = readTableFromPath(spark, betaInputPath, b_clazz);
Dataset<Tuple2<String, P>> prod = readTableFromPath(spark, prodInputPath, p_clazz);
prod
.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer")
.map((MapFunction<Tuple2<Tuple2<String, P>, Tuple2<String, B>>, P>) value -> {
Optional<P> p = Optional.ofNullable(value._1()).map(Tuple2::_2);
Optional<B> b = Optional.ofNullable(value._2()).map(Tuple2::_2);
switch (priority) {
default:
case "BETA":
return mergeWithPriorityToBETA(p, b);
case "PROD":
return mergeWithPriorityToPROD(p, b);
}
}, Encoders.bean(p_clazz))
.filter((FilterFunction<P>) Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToPROD(Optional<P> p, Optional<B> b) {
if (b.isPresent() & !p.isPresent()) {
return (P) b.get();
}
if (p.isPresent()) {
return p.get();
}
return null;
}
private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToBETA(Optional<P> p, Optional<B> b) {
if (p.isPresent() & !b.isPresent()) {
return p.get();
}
if (b.isPresent()) {
return (P) b.get();
}
return null;
}
private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableFromPath(
SparkSession spark, String inputEntityPath, Class<T> clazz) {
log.info("Reading Graph table from: {}", inputEntityPath);
return spark
.read()
.textFile(inputEntityPath)
.map(
(MapFunction<String, Tuple2<String, T>>) value -> {
final T t = OBJECT_MAPPER.readValue(value, clazz);
final String id = ModelSupport.idFn().apply(t);
return new Tuple2<>(id, t);
},
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}
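
Per identifier, the priority switch reduces to a simple rule: with `BETA` priority the BETA record wins whenever it exists and the PROD record is the fallback, and vice versa for `PROD`. A small sketch of just that decision, outside Spark, with strings standing in for the Oaf entities:

```java
// Decision rule of the BETA|PROD merge above, reduced to plain Optionals.
import java.util.Optional;

public class MergePrioritySketch {
    static Optional<String> merge(String priority, Optional<String> prod, Optional<String> beta) {
        if ("PROD".equals(priority)) {
            return prod.isPresent() ? prod : beta; // PROD wins when both sides carry the id
        }
        return beta.isPresent() ? beta : prod; // default: BETA wins
    }

    public static void main(String[] args) {
        System.out.println(merge("BETA", Optional.of("prod record"), Optional.of("beta record"))); // beta record
        System.out.println(merge("BETA", Optional.of("prod record"), Optional.empty())); // prod record
        System.out.println(merge("PROD", Optional.of("prod record"), Optional.of("beta record"))); // prod record
        System.out.println(merge("PROD", Optional.empty(), Optional.empty())); // empty, filtered out downstream
    }
}
```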

View File

@ -1,36 +1,10 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.*;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
@ -40,24 +14,8 @@ import org.dom4j.Node;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public abstract class AbstractMdRecordToOafMapper {
@ -99,7 +57,6 @@ public abstract class AbstractMdRecordToOafMapper {
final Document doc = DocumentHelper
.parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));
final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = getProvenanceDatasource(
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
@ -118,12 +75,39 @@ public abstract class AbstractMdRecordToOafMapper {
final DataInfo info = prepareDataInfo(doc, invisible);
final long lastUpdateTimestamp = new Date().getTime();
return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
final String type = getResultType(doc, instances);
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
protected String getResultType(final Document doc, final List<Instance> instances) {
String type = doc.valueOf("//dr:CobjCategory/@type");
if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
String instanceType = instances
.stream()
.map(i -> i.getInstancetype().getClassid())
.findFirst()
.map(s -> UNKNOWN.equalsIgnoreCase(s) ? "0000" : s)
.orElse("0000"); // Unknown
return Optional
.ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
.map(q -> q.getClassid())
.orElse("0000");
/*
* .orElseThrow( () -> new IllegalArgumentException( String.format("'%s' not mapped in %s", instanceType,
* DNET_RESULT_TYPOLOGIES)));
*/
}
return type;
}
private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
final String dsId = doc.valueOf(xpathId);
final String dsName = doc.valueOf(xpathName);
@ -138,8 +122,8 @@ public abstract class AbstractMdRecordToOafMapper {
protected List<Oaf> createOafs(
final Document doc,
final String type,
final List<Instance> instances,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
@ -148,14 +132,14 @@ public abstract class AbstractMdRecordToOafMapper {
switch (type.toLowerCase()) {
case "publication":
final Publication p = new Publication();
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
p.setJournal(prepareJournal(doc, info));
oafs.add(p);
break;
case "dataset":
final Dataset d = new Dataset();
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
d.setStoragedate(prepareDatasetStorageDate(doc, info));
d.setDevice(prepareDatasetDevice(doc, info));
@ -168,7 +152,7 @@ public abstract class AbstractMdRecordToOafMapper {
break;
case "software":
final Software s = new Software();
populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
s.setLicense(prepareSoftwareLicenses(doc, info));
@ -180,7 +164,7 @@ public abstract class AbstractMdRecordToOafMapper {
case "otherresearchproducts":
default:
final OtherResearchProduct o = new OtherResearchProduct();
populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
o.setResulttype(ORP_DEFAULT_RESULTTYPE);
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
@ -259,14 +243,16 @@ public abstract class AbstractMdRecordToOafMapper {
private void populateResultFields(
final Result r,
final Document doc,
final List<Instance> instances,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
r.setOriginalId(Arrays.asList(findOriginalId(doc)));
r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareResultPids(doc, info));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
@ -291,7 +277,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setCoverage(prepareCoverages(doc, info));
r.setContext(prepareContexts(doc, info));
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
r.setInstance(instances);
r.setBestaccessright(getBestAccessRights(instances));
}
@ -429,6 +415,18 @@ public abstract class AbstractMdRecordToOafMapper {
return null;
}
private String findOriginalId(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
if (n != null) {
final String id = n.valueOf("./*[local-name()='identifier']");
if (StringUtils.isNotBlank(id)) {
return id;
}
}
return doc.valueOf("//*[local-name()='header']/*[local-name()='identifier']");
}
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
}
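
Two behavioural changes are visible in this mapper: the result type now falls back to the `dnet:result_typologies` vocabulary (looked up with the first instance type) when `//dr:CobjCategory/@type` is blank, and the `originalId` is now taken from the OAI `originDescription/identifier` when present, falling back to the header identifier. A hedged dom4j sketch of the latter lookup on a tiny made-up record:

```java
// Sketch of the originalId lookup introduced above, run on a tiny made-up OAI-style
// record: prefer provenance/originDescription/identifier, fall back to the header id.
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;

public class OriginalIdSketch {
    public static void main(String[] args) throws Exception {
        final String xml = "<record>"
            + "<header><identifier>oai:repo:123</identifier></header>"
            + "<about><provenance><originDescription>"
            + "<identifier>oai:original:456</identifier>"
            + "</originDescription></provenance></about>"
            + "</record>";

        final Document doc = DocumentHelper.parseText(xml);
        final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
        String originalId = null;
        if (n != null) {
            final String id = n.valueOf("./*[local-name()='identifier']");
            if (StringUtils.isNotBlank(id)) {
                originalId = id;
            }
        }
        if (originalId == null) {
            originalId = doc.valueOf("//*[local-name()='header']/*[local-name()='identifier']");
        }
        System.out.println(originalId); // oai:original:456
    }
}
```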

View File

@ -4,7 +4,11 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
@ -57,6 +61,7 @@ public class OafMapperUtils {
.stream(values)
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
@ -65,6 +70,7 @@ public class OafMapperUtils {
.stream()
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
@ -237,4 +243,10 @@ public class OafMapperUtils {
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
}
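
`distinctByKey` is the temporary workaround accepted in PR #25 above: a stateful predicate that admits only the first element seen for a given key, so `listFields` now drops repeated values while preserving order (the longer-term, set-based model change is tracked in #9). A standalone illustration on plain strings:

```java
// Standalone illustration of the distinctByKey predicate added above, applied to
// plain strings keyed by their lower-case form. The predicate is stateful, so a
// fresh one must be created per stream, exactly as listFields does.
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class DistinctByKeySketch {
    static <T> Predicate<T> distinctByKey(final Function<? super T, ?> keyExtractor) {
        final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
        return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
    }

    public static void main(String[] args) {
        final List<String> deduped = Stream.of("DOI", "doi", "handle", "DOI")
            .filter(distinctByKey(String::toLowerCase))
            .collect(Collectors.toList());
        System.out.println(deduped); // [DOI, handle]
    }
}
```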

View File

@ -0,0 +1,89 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
object EBIAggregator {
def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{
override def zero: OafDataset = new OafDataset()
override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
b.mergeFrom(a._2)
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
wx.mergeFrom(wy)
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: OafDataset): OafDataset = reduction
override def bufferEncoder: Encoder[OafDataset] =
Encoders.kryo(classOf[OafDataset])
override def outputEncoder: Encoder[OafDataset] =
Encoders.kryo(classOf[OafDataset])
}
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
override def zero: Publication = new Publication()
override def reduce(b: Publication, a: (String, Publication)): Publication = {
b.mergeFrom(a._2)
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: Publication, wy: Publication): Publication = {
wx.mergeFrom(wy)
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: Publication): Publication = reduction
override def bufferEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
override def outputEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
}
def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{
override def zero: Relation = new Relation()
override def reduce(b: Relation, a: (String, Relation)): Relation = {
a._2
}
override def merge(a: Relation, b: Relation): Relation = {
if(b!= null) b else a
}
override def finish(reduction: Relation): Relation = reduction
override def bufferEncoder: Encoder[Relation] =
Encoders.kryo(classOf[Relation])
override def outputEncoder: Encoder[Relation] =
Encoders.kryo(classOf[Relation])
}
}

View File

@ -0,0 +1,138 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Instance, KeyValue, Oaf}
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIRelation, OafUtils, ProvenaceInfo}
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
object SparkAddLinkUpdates {
val relationMapper = RelationMapper.load
case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {}
def generatePubmedDLICollectedFrom(): KeyValue = {
OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
}
def ebiLinksToOaf(input:(String, String)):List[Oaf] = {
val pmid :String = input._1
val input_json :String = input._2
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input_json)
val targets:List[EBILinks] = for {
JObject(link) <- json \\ "Category" \\ "Link"
JField("PublicationDate", JString(pubdate)) <- link
JField("RelationshipType", JObject(relationshipType)) <- link
JField("Name", JString(relname)) <- relationshipType
JField("Target", JObject(target)) <- link
JField("Identifier", JObject(identifier)) <- target
JField("ID", JString(tpid)) <- identifier
JField("IDScheme", JString(tpidtype)) <- identifier
JField("IDURL", JString(turl)) <- identifier
JField("Title", JString(title)) <- target
JField("Publisher", JObject(pub)) <- target
JField("Name", JString(publisher)) <- pub
} yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher)
val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"
targets.flatMap(l => {
val relation = new DLIRelation
val inverseRelation = new DLIRelation
val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
val relInfo = relationMapper.get(l.relation.toLowerCase)
val relationSemantic = relInfo.getOriginal
val inverseRelationSemantic = relInfo.getInverse
relation.setSource(dnetPublicationId)
relation.setTarget(targetDnetId)
relation.setRelClass("datacite")
relation.setRelType(relationSemantic)
relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
inverseRelation.setSource(targetDnetId)
inverseRelation.setTarget(dnetPublicationId)
inverseRelation.setRelClass("datacite")
inverseRelation.setRelType(inverseRelationSemantic)
inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
val d = new DLIDataset
d.setId(targetDnetId)
d.setDataInfo(OafUtils.generateDataInfo())
d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, "dnet:pid_types")).asJava)
d.setCompletionStatus("complete")
val pi = new ProvenaceInfo
pi.setId("dli_________::europe_pmc__")
pi.setName( "Europe PMC")
pi.setCompletionStatus("complete")
pi.setCollectionMode("collected")
d.setDlicollectedfrom(List(pi).asJava)
d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
d.setPublisher(OafUtils.asField(l.publisher))
d.setTitle(List(OafUtils.createSP(l.title, "main title", "dnet:dataCite_title")).asJava)
d.setDateofacceptance(OafUtils.asField(l.pubdate))
val i = new Instance
i.setCollectedfrom(generatePubmedDLICollectedFrom())
i.setDateofacceptance(d.getDateofacceptance)
i.setUrl(List(l.turl).asJava)
i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
d.setInstance(List(i).asJava)
List(relation, inverseRelation, d)
})
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val workingPath = parser.get("workingPath")
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val relEncoder: Encoder[DLIRelation] = Encoders.kryo(classOf[DLIRelation])
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))
ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")
ds.filter(s => s.isInstanceOf)
val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]
oDataset.filter(p =>p.isInstanceOf[DLIRelation]).map(p => p.asInstanceOf[DLIRelation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
}
}
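
Both the source publication id and the generated target dataset id above follow the D-Net identifier convention `<typePrefix>|md5(<pid>::<pidType>)`, with prefix 50 for results; the EBI target side lower-cases and trims pid and type before hashing. A hedged Java sketch of the same construction, assuming `DHPUtils.md5` is a plain MD5 hex digest (commons-codec is used here instead):

```java
// Sketch of the identifier scheme used above: "50|" + md5(pid::pidType).
// Assumes DHPUtils.md5 is an MD5 hex digest; inputs are made up.
import org.apache.commons.codec.digest.DigestUtils;

public class DnetIdSketch {
    static String resultId(String pid, String pidType) {
        return "50|" + DigestUtils.md5Hex(pid + "::" + pidType);
    }

    public static void main(String[] args) {
        System.out.println(resultId("10872796", "pmid")); // source publication id, from the PMID
        System.out.println(resultId("chembl25", "chembl")); // target dataset id (pid/type lower-cased upstream)
    }
}
```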

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
import scala.io.Source
import scala.xml.pull.XMLEventReader
object SparkCreateBaselineDataFrame {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sc = spark.sparkContext
val workingPath = parser.get("workingPath")
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
new PMParser(xml)
} ))
ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
}
}

View File

@ -0,0 +1,87 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkCreateEBIDataFrame {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sc = spark.sparkContext
val workingPath = parser.get("workingPath")
val relationMapper = RelationMapper.load
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
logger.info("Extract Publication and relation from publication_xml")
val oafPubsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/publication_xml").map(s =>
{
new ObjectMapper().readValue(s, classOf[String])
}).flatMap(s => {
val d = new PublicationScholexplorerParser
d.parseObject(s, relationMapper).asScala.iterator})
val mapper = new ObjectMapper()
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")
logger.info("Extract Publication and relation from dataset_xml")
val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/dataset_xml").map(s =>
{
new ObjectMapper().readValue(s, classOf[String])
}).flatMap(s => {
val d = new DatasetScholexplorerParser
d.parseObject(s, relationMapper).asScala.iterator})
spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf")
val dataset: Dataset[OafDataset] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[OafDataset]).map(d => d.asInstanceOf[OafDataset])
val publication: Dataset[Publication] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Publication]).map(d => d.asInstanceOf[Publication])
val relations: Dataset[Relation] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation])
publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getPublicationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/publication")
dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDatasetAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")
relations.map(r => (r.getSource, r.getTarget))(Encoders.tuple(Encoders.STRING,Encoders.STRING))
}
}

View File

@ -0,0 +1,64 @@
package eu.dnetlib.dhp.sx.ebi.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
public class PMArticle implements Serializable {
private String pmid;
private String date;
private PMJournal journal;
private String title;
private String description;
private List<PMAuthor> authors = new ArrayList<>();
public String getPmid() {
return pmid;
}
public void setPmid(String pmid) {
this.pmid = pmid;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
public PMJournal getJournal() {
return journal;
}
public void setJournal(PMJournal journal) {
this.journal = journal;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public List<PMAuthor> getAuthors() {
return authors;
}
public void setAuthors(List<PMAuthor> authors) {
this.authors = authors;
}
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dhp.sx.ebi.model;
import java.io.Serializable;
public class PMAuthor implements Serializable {
private String lastName;
private String foreName;
public String getLastName() {
return lastName;
}
public void setLastName(String lastName) {
this.lastName = lastName;
}
public String getForeName() {
return foreName;
}
public void setForeName(String foreName) {
this.foreName = foreName;
}
public String getFullName() {
return String.format("%s, %s", this.foreName, this.lastName);
}
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.sx.ebi.model;
import java.io.Serializable;
public class PMJournal implements Serializable {
private String issn;
private String volume;
private String issue;
private String date;
private String title;
public String getIssn() {
return issn;
}
public void setIssn(String issn) {
this.issn = issn;
}
public String getVolume() {
return volume;
}
public void setVolume(String volume) {
this.volume = volume;
}
public String getIssue() {
return issue;
}
public void setIssue(String issue) {
this.issue = issue;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.sx.ebi.model
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
var currentArticle:PMArticle = generateNextArticle()
override def hasNext: Boolean = currentArticle!= null
override def next(): PMArticle = {
val tmp = currentArticle
currentArticle = generateNextArticle()
tmp
}
def generateNextArticle():PMArticle = {
var currentAuthor: PMAuthor = null
var currentJournal: PMJournal = null
var currNode: String = null
var currentYear = "0"
var currentMonth = "01"
var currentDay = "01"
while (xml.hasNext) {
xml.next match {
case EvElemStart(_, label, _, _) =>
currNode = label
label match {
case "PubmedArticle" => currentArticle = new PMArticle
case "Author" => currentAuthor = new PMAuthor
case "Journal" => currentJournal = new PMJournal
case _ =>
}
case EvElemEnd(_, label) =>
label match {
case "PubmedArticle" => return currentArticle
case "Author" => currentArticle.getAuthors.add(currentAuthor)
case "Journal" => currentArticle.setJournal(currentJournal)
case "DateCompleted" => currentArticle.setDate(s"$currentYear-$currentMonth-$currentDay")
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
case _ =>
}
case EvText(text) =>
if (currNode!= null && text.trim.nonEmpty)
currNode match {
case "ArticleTitle" => {
if (currentArticle.getTitle==null)
currentArticle.setTitle(text.trim)
else
currentArticle.setTitle(currentArticle.getTitle + text.trim)
}
case "AbstractText" => {
if (currentArticle.getDescription==null)
currentArticle.setDescription(text.trim)
else
currentArticle.setDescription(currentArticle.getDescription + text.trim)
}
case "PMID" => currentArticle.setPmid(text.trim)
case "ISSN" => currentJournal.setIssn(text.trim)
case "Year" => currentYear = text.trim
case "Month" => currentMonth = text.trim
case "Day" => currentDay = text.trim
case "Volume" => currentJournal.setVolume( text.trim)
case "Issue" => currentJournal.setIssue (text.trim)
case "LastName" => {
if (currentAuthor != null)
currentAuthor.setLastName(text.trim)
}
case "ForeName" => if (currentAuthor != null)
currentAuthor.setForeName(text.trim)
case "Title" =>
if (currentJournal.getTitle == null)
currentJournal.setTitle(text.trim)
else
currentJournal.setTitle(currentJournal.getTitle + text.trim)
case _ =>
}
case _ =>
}
}
null
}
}
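
A minimal usage sketch for the parser above, assuming the PMArticle bean referenced by this diff is on the classpath; the XML fragment is illustrative only, real input comes from the PubMed baseline/update dumps.

```
import scala.io.Source
import scala.xml.pull.XMLEventReader

// Illustrative PubMed fragment
val xml =
  """<PubmedArticleSet>
    |  <PubmedArticle>
    |    <PMID>12345678</PMID>
    |    <ArticleTitle>An example title</ArticleTitle>
    |  </PubmedArticle>
    |</PubmedArticleSet>""".stripMargin

// PMParser is an Iterator[PMArticle], so standard collection operations apply.
val articles = new PMParser(new XMLEventReader(Source.fromString(xml))).toList
articles.foreach(a => println(a.getTitle))
```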

View File

@ -150,6 +150,17 @@ public abstract class AbstractScholexplorerParser {
return uk;
}
protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
final String schemeName) {
final Qualifier q = new Qualifier();
q.setClassid(classId);
q.setClassname(className);
q.setSchemeid(schemeId);
q.setSchemename(schemeName);
return q;
}
protected void generateRelations(
RelationMapper relationMapper,
Result parsedObject,

View File

@ -64,7 +64,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
currentDate.setQualifier(dateQualifier);
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}
final String completionStatus = VtdUtilityParser
.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
@ -149,6 +148,37 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));
String resolvedURL = null;
switch (currentPid.getQualifier().getClassname().toLowerCase()) {
case "uniprot":
resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue();
break;
case "ena":
if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8);
break;
case "chembl":
resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue();
break;
case "ncbi-n":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "ncbi-p":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "genbank":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "pdb":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "url":
resolvedURL = currentPid.getValue();
break;
}
final String sourceId = generateId(
currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
parsedObject.setId(sourceId);
@ -251,6 +281,11 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
st
.setQualifier(
generateQualifier(
"main title", "main title", "dnet:dataCite_title",
"dnet:dataCite_title"));
return st;
})
.collect(Collectors.toList()));
@ -282,6 +317,13 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
.collect(Collectors.toList()));
}
if (StringUtils.isNotBlank(resolvedURL)) {
Instance i = new Instance();
i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
i.setUrl(Collections.singletonList(resolvedURL));
parsedObject.setInstance(Collections.singletonList(i));
}
result.add(parsedObject);
return result;
} catch (Throwable e) {

View File

@ -202,6 +202,11 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser
t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
st
.setQualifier(
generateQualifier(
"main title", "main title", "dnet:dataCite_title",
"dnet:dataCite_title"));
return st;
})
.collect(Collectors.toList()));

View File

@ -282,6 +282,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>100</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>

View File

@ -5,6 +5,12 @@
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of dataset partitions",
"paramRequired": false
},
{
"paramName": "in",
"paramLongName": "inputPath",

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,293 @@
<workflow-app name="merge graphs" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>betaInputGgraphPath</name>
<description>the beta graph root path</description>
</property>
<property>
<name>prodInputGgraphPath</name>
<description>the production graph root path</description>
</property>
<property>
<name>graphOutputPath</name>
<description>the output merged graph root path</description>
</property>
<property>
<name>priority</name>
<description>decides which infrastructure's content wins in case of an ID clash</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<start to="fork_merge_graph"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<fork name="fork_merge_graph">
<path start="merge_publication"/>
<path start="merge_dataset"/>
<path start="merge_otherresearchproduct"/>
<path start="merge_software"/>
<path start="merge_datasource"/>
<path start="merge_organization"/>
<path start="merge_project"/>
<path start="merge_relation"/>
</fork>
<action name="merge_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge publications</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/publication</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/publication</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<action name="merge_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge datasets</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/dataset</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/dataset</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<action name="merge_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge otherresearchproducts</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/otherresearchproduct</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<action name="merge_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge softwares</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/software</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/software</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<action name="merge_datasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge datasources</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/datasource</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/datasource</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<action name="merge_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge organizations</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/organization</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/organization</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<action name="merge_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge projects</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/project</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/project</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<action name="merge_relation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Merge relations</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/relation</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/relation</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--priority</arg><arg>${priority}</arg>
</spark>
<ok to="wait_merge"/>
<error to="Kill"/>
</action>
<join name="wait_merge" to="End"/>
<end name="End"/>
</workflow-app>
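
The priority parameter above only establishes which graph wins when the same identifier exists on both sides. Below is a conceptual sketch of that rule, not the MergeGraphSparkJob implementation; the literal values BETA and PROD are assumed for illustration.

```
// Records keyed by id from the two infrastructures: the prioritised side wins
// on a clash, the other side contributes the ids only it provides.
def mergeById[T](beta: Map[String, T], prod: Map[String, T], priority: String): Map[String, T] =
  (beta.keySet ++ prod.keySet).map { id =>
    val winner =
      if (priority == "BETA") beta.getOrElse(id, prod(id))
      else prod.getOrElse(id, beta(id))
    id -> winner
  }.toMap
```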

View File

@ -0,0 +1,38 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "bin",
"paramLongName": "betaInputPath",
"paramDescription": "the beta graph root path",
"paramRequired": true
},
{
"paramName": "pin",
"paramLongName": "prodInputPath",
"paramDescription": "the production graph root path",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the output merged graph root path",
"paramRequired": true
},
{
"paramName": "class",
"paramLongName": "graphTableClassName",
"paramDescription": "class name moelling the graph table",
"paramRequired": true
},
{
"paramName": "pr",
"paramLongName": "priority",
"paramDescription": "decides from which infrastructure the content must win in case of ID clash",
"paramRequired": false
}
]

View File

@ -0,0 +1,4 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}
]

View File

@ -0,0 +1,68 @@
<configuration>
<!-- OCEAN -->
<!--
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
-->
<!-- GARR -->
<property>
<name>jobTracker</name>
<value>yarn</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

View File

@ -0,0 +1,97 @@
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the Working Path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="GenerateUpdates"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="GenerateBaselineDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Baseline DataSet</name>
<class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="GenerateUpdates">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Link Updates</name>
<class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="CreateEBIDataSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create EBI DataSet</name>
<class>eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=1000
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -5,8 +5,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.List;
@ -20,7 +19,9 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
@ -31,24 +32,25 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class MappersTest {
@Mock
private ISLookUpService isLookUpService;
@Mock
private VocabularyGroup vocs;
@BeforeEach
public void setUp() throws Exception {
when(vocs.getTermAsQualifier(anyString(), anyString()))
.thenAnswer(
invocation -> OafMapperUtils
.qualifier(
invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
invocation.getArgument(0)));
when(vocs.termExists(anyString(), anyString())).thenReturn(true);
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
}
@Test
@ -68,9 +70,14 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2);
assertValidId(p.getId());
assertTrue(p.getOriginalId().size() == 1);
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));
assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
assertFalse(p.getDataInfo().getInvisible());
assertTrue(p.getSource().size() == 1);
assertTrue(p.getAuthor().size() > 0);
final Optional<Author> author = p
@ -79,6 +86,7 @@ public class MappersTest {
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.findFirst();
assertTrue(author.isPresent());
final StructuredProperty pid = author
.get()
.getPid()
@ -169,6 +177,8 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2);
assertValidId(d.getId());
assertTrue(d.getOriginalId().size() == 1);
assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
assertValidId(d.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
assertTrue(d.getAuthor().size() > 0);
@ -255,10 +265,32 @@ public class MappersTest {
assertTrue(s.getInstance().size() > 0);
}
// @Test
void testDataset_2() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
}
private void assertValidId(final String id) {
assertEquals(49, id.length());
assertEquals('|', id.charAt(2));
assertEquals(':', id.charAt(15));
assertEquals(':', id.charAt(16));
}
private List<String> vocs() throws IOException {
return IOUtils
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
}
private List<String> synonyms() throws IOException {
return IOUtils
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
}
}

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.sx.ebi
import org.junit.jupiter.api.Test
class TestEBI {
// @Test
def testEBIData() = {
SparkAddLinkUpdates.main("-mt local[*] -w /home/sandro/Downloads".split(" "))
}
}

View File

@ -34,6 +34,8 @@
<dc:relation>info:eu-repo/semantics/altIdentifier/eissn/2367-8194</dc:relation>
<dc:relation>info:eu-repo/grantAgreement/EC/FP7/226852</dc:relation>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:subject>Ecosystem Services hotspots</dc:subject>
<dc:subject>Natura 2000</dc:subject>
<dc:subject>Quiet Protected Areas</dc:subject>
@ -47,7 +49,8 @@
<dc:subject>regulating services</dc:subject>
<dc:subject>supporting services</dc:subject>
<dc:type>Research Article</dc:type>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<!--<dr:CobjCategory type="publication">0001</dr:CobjCategory>-->
<dr:CobjCategory>0001</dr:CobjCategory>
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
<oaf:projectid>corda_______::226852</oaf:projectid>
<oaf:accessrights>OPEN</oaf:accessrights>

View File

@ -82,7 +82,8 @@
<p>All files are in MATLAB .mat format.</p></description>
</descriptions>
</resource>
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
<!--<dr:CobjCategory type="dataset">0021</dr:CobjCategory>-->
<dr:CobjCategory>0021</dr:CobjCategory>
<oaf:dateAccepted>2019-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language>und</oaf:language>

View File

@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<oai:record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<oai:header>
<dri:objIdentifier>opentrials__::0000bf8e63d3d7e6b88421eabafae3f6</dri:objIdentifier>
<dri:recordIdentifier>feabb67c-1fd1-423b-aec6-606d04ce53c6</dri:recordIdentifier>
<dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
<oaf:datasourceprefix>opentrials__</oaf:datasourceprefix>
<dr:dateOfTransformation>2019-04-17T16:04:20.586Z</dr:dateOfTransformation>
</oai:header>
<oai:metadata>
<resource xmlns="http://datacite.org/schema/kernel-3"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
<identifier identifierType="URL">https://clinicaltrials.gov/ct2/show/NCT02321059</identifier>
<alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="URL">http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059</alternateIdentifier>
<alternateIdentifier alternateIdentifierType="nct">NCT02321059</alternateIdentifier>
</alternateIdentifiers>
<creators>
<creator>
<creatorName>Jensen, Kristian K</creatorName>
</creator>
</creators>
<titles>
<title>Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia</title>
</titles>
<publisher>nct</publisher>
<geoLocations>
<geoLocationPlace>Denmark</geoLocationPlace>
</geoLocations>
<resourceType resourceTypeGeneral="clinicalTrial">0037</resourceType>
<descriptions>
<description descriptionType="Abstract">Patients with an incisional hernia in the midline and controls with an intact abdominal wall are examined twice with one week apart, in order to establish the test-retest reliability and internal and external validity of the Goodstrength trunk dynamometer.</description>
</descriptions>
</resource>
<oaf:accessrights>OPEN</oaf:accessrights>
<dr:CobjCategory type="dataset">0037</dr:CobjCategory>
<oaf:dateAccepted>2014-11-11</oaf:dateAccepted>
<oaf:hostedBy id="openaire____::opentrials" name="OpenTrials"/>
<oaf:collectedFrom id="openaire____::opentrials" name="OpenTrials"/>
<oaf:about>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction
classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</oaf:about>
</oai:metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2019-03-27T15:15:22.22Z">
<baseURL>file:///var/lib/dnet/data/opentrials/opentrials.csv</baseURL>
<identifier/>
<datestamp/>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</oai:record>

View File

@ -52,7 +52,8 @@
subjectScheme="EDAM Ontology" valueURI="http://edamontology.org/topic_3534">Protein binding sites</datacite:subject>
</datacite:subjects>
</datacite:resource>
<dr:CobjCategory type="software">0029</dr:CobjCategory>
<!--<dr:CobjCategory type="software">0029</dr:CobjCategory>-->
<dr:CobjCategory>0029</dr:CobjCategory>
<oaf:hostedBy id="rest________::bioTools" name="bio.tools"/>
<oaf:collectedFrom id="rest________::bioTools" name="bio.tools"/>
<oaf:dateAccepted>2018-06-06</oaf:dateAccepted>

View File

@ -0,0 +1,55 @@
{
"Category": [
{
"Section": [
{
"Linklist": {
"Link": [
{
"LinkProvider": {
"Name": "Europe PMC"
},
"Target": {
"Publisher": {
"Name": "Altmetric"
},
"ImageURL": "https://api.altmetric.com/v1/donut/58578459_64.png",
"Identifier": {
"ID": "https://www.altmetric.com/details/58578459",
"IDScheme": "URL",
"IDURL": "https://www.altmetric.com/details/58578459"
},
"Type": {
"Name": "dataset"
},
"Title": "Optical clumped isotope thermometry of carbon dioxide"
},
"Source": {
"Identifier": {
"ID": "30886173",
"IDScheme": "PMID"
},
"Type": {
"Name": "literature"
}
},
"PublicationDate": "06-04-2019",
"RelationshipType": {
"Name": "IsReferencedBy"
},
"ObtainedBy": "ext_links"
}
]
},
"ObtainedBy": "ext_links",
"SectionLinkCount": 1,
"Tags": [
"altmetrics"
]
}
],
"CategoryLinkCount": 1,
"Name": "Altmetric"
}
]
}

View File

@ -0,0 +1,191 @@
{
"version": "6.3",
"hitCount": 4,
"request": {
"id": "28818901",
"source": "MED"
},
"dataLinkList": {
"Category": [
{
"Name": "Nucleotide Sequences",
"CategoryLinkCount": 3,
"Section": [
{
"ObtainedBy": "tm_accession",
"Tags": [
"supporting_data"
],
"SectionLinkCount": 1,
"Linklist": {
"Link": [
{
"ObtainedBy": "tm_accession",
"PublicationDate": "27-02-2020",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "References"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "MED"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "AP008937",
"IDScheme": "ENA",
"IDURL": "http://identifiers.org/ena.embl/AP008937"
},
"Title": "AP008937",
"Publisher": {
"Name": "Europe PMC"
}
},
"Frequency": 1
}
]
}
},
{
"ObtainedBy": "submission",
"Tags": [
"related_data"
],
"SectionLinkCount": 2,
"CollectionURL": "http://www.ebi.ac.uk/ena/data/search?query=28818901",
"Linklist": {
"Link": [
{
"ObtainedBy": "submission",
"PublicationDate": "25-06-2018",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "IsReferencedBy"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "PMID"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "NIWV01000000",
"IDScheme": "ENA",
"IDURL": "http://www.ebi.ac.uk/ena/data/view/NIWV01000000"
},
"Title": "Nucleotide sequences",
"Publisher": {
"Name": "ENA"
}
}
},
{
"ObtainedBy": "submission",
"PublicationDate": "25-06-2018",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "IsReferencedBy"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "PMID"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "PRJNA390617",
"IDScheme": "ENA",
"IDURL": "http://www.ebi.ac.uk/ena/data/view/PRJNA390617"
},
"Title": "Lactobacillus fermentum strain:BFE 6620",
"Publisher": {
"Name": "ENA"
}
}
}
]
}
}
]
},
{
"Name": "BioStudies: supplemental material and supporting data",
"CategoryLinkCount": 1,
"Section": [
{
"ObtainedBy": "ext_links",
"Tags": [
"supporting_data"
],
"SectionLinkCount": 1,
"Linklist": {
"Link": [
{
"ObtainedBy": "ext_links",
"PublicationDate": "24-07-2018",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "IsReferencedBy"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "PMID"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true",
"IDScheme": "URL",
"IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true"
},
"Title": "Draft Genome Sequence of Lactobacillus fermentum BFE 6620, a Potential Starter Culture for African Vegetable Foods, Isolated from Fermented Cassava.",
"Publisher": {
"Name": "BioStudies: supplemental material and supporting data"
}
}
}
]
}
}
]
}
]
}
}
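
A minimal sketch of how a payload shaped like the fixture above can be walked to collect the linked Target identifiers. It uses Jackson's tree model (ObjectMapper already appears elsewhere in this changeset) purely for illustration and is not the SparkAddLinkUpdates logic.

```
import com.fasterxml.jackson.databind.ObjectMapper
import scala.collection.JavaConverters._

// Collects the ID of every Target object found anywhere in the document,
// e.g. "AP008937", "NIWV01000000", "PRJNA390617" for the fixture above.
def targetIds(json: String): Seq[String] = {
  val root = new ObjectMapper().readTree(json)
  root.findValues("Target").asScala
    .map(t => t.path("Identifier").path("ID").asText())
    .filter(_.nonEmpty)
}
```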

View File

@ -5,11 +5,12 @@ import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.common.PacePerson
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty}
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
import scala.collection.JavaConverters._
@ -99,6 +100,20 @@ object DLIToOAF {
)
def fixInstance(r:Publication) :Publication = {
val collectedFrom = r.getCollectedfrom.asScala.head
r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
r
}
def fixInstanceDataset(r:Dataset) :Dataset = {
val collectedFrom = r.getCollectedfrom.asScala.head
r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
r
}
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
@ -412,46 +427,6 @@ object DLIToOAF {
}
def generateKeyValue(key: String, value: String): KeyValue = {
val kv: KeyValue = new KeyValue()
kv.setKey(key)
kv.setValue(value)
kv.setDataInfo(generateDataInfo("0.9"))
kv
}
def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
di
}
def createQualifier(cls: String, sch: String): Qualifier = {
createQualifier(cls, cls, sch, sch)
}
def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
val q: Qualifier = new Qualifier
q.setClassid(classId)
q.setClassname(className)
q.setSchemeid(schemeId)
q.setSchemename(schemeName)
q
}
def asField[T](value: T): Field[T] = {
val tmp = new Field[T]
tmp.setValue(value)
tmp
}
}

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.`export`
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.{Instance, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
@ -166,10 +166,13 @@ object SparkExportContentForOpenAire {
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS")
val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet)
val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet)
val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet)
spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS_fixed")
spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS_fixed")
val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet)
val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet)
val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet)
fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}

View File

@ -16,15 +16,15 @@
<value>spark2</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>
</configuration>

View File

@ -0,0 +1,18 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
echo "Getting file from " $3
hdfs dfs -copyToLocal $3
echo "Running impala shell make the new database visible"
impala-shell -q "INVALIDATE METADATA;"
echo "Running impala shell to compute new table stats"
impala-shell -d $1 -f $2
echo "Impala shell finished"
rm $2

View File

@ -1,11 +1,8 @@
-- DROP database if EXISTS ${hive_db_name} cascade;
-- CREATE database ${hive_db_name};
--
-- CREATE TABLE ${hive_db_name}.Persons (
-- PersonID int,
-- LastName varchar(255));
--
-- INSERT INTO ${hive_db_name}.Persons VALUES (1, "test_db_spyros_rec_111");
--------------------------------------------------------------
--------------------------------------------------------------
-- Stats database creation
--------------------------------------------------------------
--------------------------------------------------------------
DROP database IF EXISTS ${stats_db_name} CASCADE;
CREATE database ${stats_db_name};

View File

@ -0,0 +1,21 @@
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref;
CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country;
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp;
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap;
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture;
CREATE OR REPLACE VIEW ${stats_db_name}.context AS SELECT * FROM ${external_stats_db_name}.context;
CREATE OR REPLACE VIEW ${stats_db_name}.category AS SELECT * FROM ${external_stats_db_name}.category;
CREATE OR REPLACE VIEW ${stats_db_name}.concept AS SELECT * FROM ${external_stats_db_name}.concept;
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Creation date of the database
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
create table ${stats_db_name}.creation_date as select date_format(current_date(), 'dd-MM-yyyy') as date;

View File

@ -1,7 +0,0 @@
----------------------------------------------------------------
----------------------------------------------------------------
-- Organization table/view and Organization related tables/views
----------------------------------------------------------------
----------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.organization;
CREATE TABLE ${stats_db_name}.organization AS SELECT substr(o.id, 4) as id, o.legalname.value as name, o.country.classid as country from ${openaire_db_name}.organization o WHERE o.datainfo.deletedbyinference=false;

View File

@ -1 +0,0 @@
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations;

View File

@ -1 +0,0 @@
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id FROM ${stats_db_name}.project_organizations;

View File

@ -1,10 +1,44 @@
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref;
CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country;
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp;
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap;
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture;
----------------------------------------------------------------
----------------------------------------------------------------
-- Post processing - Updates on main tables
----------------------------------------------------------------
----------------------------------------------------------------
--Datasource temporary table updates
UPDATE ${stats_db_name}.datasource_tmp SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd WHERE d.id=rd.datasource);
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication');
DROP TABLE IF EXISTS ${stats_db_name}.project;
CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id , p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration,
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
p.callidentifier, p.code
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id
WHERE r.type='publication'
GROUP BY pr.id) AS prr1 on prr1.id = p.id
LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate)) ) AS daysForlastPub , count(distinct r.id) AS dp
FROM ${stats_db_name}.project_tmp pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r
WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
GROUP BY pp.id) AS prr2
ON prr2.id = p.id;
-- Publication temporary table updates
UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
-- Dataset temporary table updates
UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
-- Software temporary table updates
UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
-- Otherresearchproduct temporary table updates
UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend AS daysfromend FROM ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id AND result.type='publication' AND project.id=result_projects.project;

View File

@ -0,0 +1,38 @@
------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource;
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * FROM ${stats_db_name}.datasource_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.publication;
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * FROM ${stats_db_name}.publication_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.dataset;
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * FROM ${stats_db_name}.dataset_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.software;
CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * FROM ${stats_db_name}.software_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct;
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * FROM ${stats_db_name}.otherresearchproduct_tmp;
DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp;
DROP TABLE ${stats_db_name}.publication_tmp;
DROP TABLE ${stats_db_name}.dataset_tmp;
DROP TABLE ${stats_db_name}.software_tmp;
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
----------------------------------------------
-- Re-creating views from final parquet tables
---------------------------------------------
-- Result
CREATE OR REPLACE VIEW ${stats_db_name}.result AS SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct;
-------------------------------------------------------------------------------
-- To see with Antonis if the following is needed and where it should be placed
-------------------------------------------------------------------------------
CREATE TABLE ${stats_db_name}.numbers_country AS SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications FROM ${stats_db_name}.result r, ${stats_db_name}.result_datasources rd, ${stats_db_name}.datasource d, ${stats_db_name}.datasource_organizations dor, ${stats_db_name}.organization org WHERE r.id=rd.id AND rd.datasource=d.id AND d.id=dor.id AND dor.organization=org.id AND r.type='publication' AND r.bestlicence='Open Access' GROUP BY org.country;

View File

@ -1,6 +0,0 @@
----------------------------
-- Post processing - Updates
----------------------------
--Datasource temporary table updates
UPDATE ${stats_db_name}.datasource_tmp set harvested ='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd where d.id=rd.datasource);

View File

@ -1,2 +0,0 @@
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication');

View File

@ -1,20 +0,0 @@
DROP TABLE IF EXISTS ${stats_db_name}.project;
CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id , p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration,
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END as haspubs,
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END as numpubs,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END as daysforlastpub,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END as delayedpubs,
p.callidentifier, p.code
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id
WHERE r.type='publication'
GROUP BY pr.id) AS prr1 on prr1.id = p.id
LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate)) ) as daysForlastPub , count(distinct r.id) as dp
FROM ${stats_db_name}.project_tmp pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r
WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
GROUP BY pp.id) AS prr2
on prr2.id = p.id;

View File

@ -1,2 +0,0 @@
-- Publication temporary table updates
UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

View File

@ -1,2 +0,0 @@
-- Dataset temporary table updates
UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

View File

@ -1,2 +0,0 @@
-- Software temporary table updates
UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

View File

@ -1,2 +0,0 @@
-- Otherresearchproduct temporary table updates
UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

View File

@ -1 +0,0 @@
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend as daysfromend FROM ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id and result.type='publication' and project.id=result_projects.project;

View File

@ -1,26 +1,59 @@
------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource;
CREATE TABLE ${stats_db_name}.datasource stored as parquet as select * from ${stats_db_name}.datasource_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.publication;
CREATE TABLE ${stats_db_name}.publication stored as parquet as select * from ${stats_db_name}.publication_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.dataset;
CREATE TABLE ${stats_db_name}.dataset stored as parquet as select * from ${stats_db_name}.dataset_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.software;
CREATE TABLE ${stats_db_name}.software stored as parquet as select * from ${stats_db_name}.software_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct;
CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as select * from ${stats_db_name}.otherresearchproduct_tmp;
DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp;
DROP TABLE ${stats_db_name}.publication_tmp;
DROP TABLE ${stats_db_name}.dataset_tmp;
DROP TABLE ${stats_db_name}.software_tmp;
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Sources related tables/views
------------------------------------------------------
------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
SELECT * FROM ${stats_db_name}.publication_sources
UNION ALL
SELECT * FROM ${stats_db_name}.dataset_sources
UNION ALL
SELECT * FROM ${stats_db_name}.software_sources
UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
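Each `*_sources` table maps a result to the datasource it was collected from, falling back to 'other' when that datasource is not part of the graph, and `result_sources` unions the four of them. A purely illustrative query over the view, assuming the tables above are in place:

```sql
-- Illustrative only: results per collecting datasource across all result types.
SELECT rs.datasource, COUNT(DISTINCT rs.id) AS results
FROM ${stats_db_name}.result_sources rs
GROUP BY rs.datasource
ORDER BY results DESC;
```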

View File

@ -1,7 +1,49 @@
----------------------------------------------
-- Re-creating views from final parquet tables
---------------------------------------------
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Licences related tables/views
------------------------------------------------------
------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
-- Result
CREATE OR REPLACE VIEW ${stats_db_name}.result as SELECT *, bestlicence as access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.otherresearchproduct;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.dataset_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids AS
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources as
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
FROM (
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id;
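The licence tables keep one row per (result, licence string) pair; note that the licence value is stored in a column named `type`. An illustrative query over the union view, for orientation only:

```sql
-- Illustrative only: most frequent licence statements across all result types.
SELECT type AS licence, COUNT(*) AS results
FROM ${stats_db_name}.result_licenses
GROUP BY type
ORDER BY results DESC
LIMIT 20;
```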

View File

@ -0,0 +1,36 @@
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Refereed related tables/views
------------------------------------------------------
------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed
union all
select * from ${stats_db_name}.dataset_refereed
union all
select * from ${stats_db_name}.software_refereed
union all
select * from ${stats_db_name}.otherresearchproduct_refereed;
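The refereed tables record, per result instance, the refereed classification taken from the instance metadata, and `result_refereed` unions them across result types. An illustrative distribution query, assuming the tables above exist:

```sql
-- Illustrative only: how results are distributed over refereed classifications.
SELECT refereed, COUNT(DISTINCT id) AS results
FROM ${stats_db_name}.result_refereed
GROUP BY refereed;
```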

View File

@ -0,0 +1,80 @@
----------------------------------------------------
-- Shortcuts for various definitions in stats db ---
----------------------------------------------------
-- Peer reviewed:
-- Results that have been collected from Crossref
create table ${stats_db_name}.result_peerreviewed as
with peer_reviewed as (
select distinct r.id as id
from ${stats_db_name}.result r
join ${stats_db_name}.result_sources rs on rs.id=r.id
join ${stats_db_name}.datasource d on d.id=rs.datasource
where d.name='Crossref')
select distinct peer_reviewed.id as id, true as peer_reviewed
from peer_reviewed
union all
select distinct r.id as id, false as peer_reviewed
from ${stats_db_name}.result r
left outer join peer_reviewed pr on pr.id=r.id
where pr.id is null;
-- Green OA:
-- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal.
create table ${stats_db_name}.result_greenoa as
with result_green as (
select distinct r.id as id
from ${stats_db_name}.result r
join ${stats_db_name}.result_datasources rd on rd.id=r.id
join ${stats_db_name}.datasource d on d.id=rd.datasource
left outer join (
select rd.id from ${stats_db_name}.result_datasources rd
join ${stats_db_name}.datasource d on rd.datasource=d.id
join ${stats_db_name}.datasource_sources sds on sds.id=d.id
join ${stats_db_name}.datasource sd on sd.id=sds.datasource
where sd.name='DOAJ-ARTICLES'
) as doaj on doaj.id=r.id
where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null)
select distinct result_green.id, true as green
from result_green
union all
select distinct r.id as id, false as green
from ${stats_db_name}.result r
left outer join result_green rg on rg.id=r.id
where rg.id is null;
-- GOLD OA:
-- OA results that have been harvested from a DOAJ journal.
create table ${stats_db_name}.result_gold as
with result_gold as (
select distinct r.id as id
from ${stats_db_name}.result r
join ${stats_db_name}.result_datasources rd on rd.id=r.id
join ${stats_db_name}.datasource d on d.id=rd.datasource
join ${stats_db_name}.datasource_sources sds on sds.id=d.id
join ${stats_db_name}.datasource sd on sd.id=sds.datasource
where r.type='publication' and r.bestlicence='Open Access' and sd.name='DOAJ-Articles')
select distinct result_gold.id, true as gold
from result_gold
union all
select distinct r.id, false as gold
from ${stats_db_name}.result r
where r.id not in (select id from result_gold);
-- shortcut result-country through the organization affiliation
create table ${stats_db_name}.result_affiliated_country as
select r.id as id, o.country as country
from ${stats_db_name}.result r
join ${stats_db_name}.result_organization ro on ro.id=r.id
join ${stats_db_name}.organization o on o.id=ro.organization
where o.country is not null and o.country!='';
-- shortcut result-country through datasource of deposition
create table ${stats_db_name}.result_deposited_country as
select r.id as id, o.country as country
from ${stats_db_name}.result r
join ${stats_db_name}.result_datasources rd on rd.id=r.id
join ${stats_db_name}.datasource d on d.id=rd.datasource
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
join ${stats_db_name}.organization o on o.id=dor.organization
where o.country is not null and o.country!='';
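These shortcut tables assign every result a boolean (peer_reviewed, green, gold) or a country, so downstream reports can avoid re-deriving the definitions. A hypothetical query combining them, shown only to illustrate how the shortcuts are meant to be consumed:

```sql
-- Illustrative only: green/gold OA counts per affiliation country, using the shortcut tables.
SELECT rac.country,
       SUM(CASE WHEN g.green THEN 1 ELSE 0 END) AS green_results,
       SUM(CASE WHEN gl.gold THEN 1 ELSE 0 END) AS gold_results
FROM ${stats_db_name}.result_affiliated_country rac
JOIN ${stats_db_name}.result_greenoa g  ON g.id = rac.id
JOIN ${stats_db_name}.result_gold   gl ON gl.id = rac.id
GROUP BY rac.country;
```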

View File

@ -0,0 +1,55 @@
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
-- peer reviewed)
drop table if exists ${stats_db_name}.result_tmp;
CREATE TABLE ${stats_db_name}.result_tmp (
id STRING,
title STRING,
publisher STRING,
journal STRING,
`date` STRING,
`year` INT,
bestlicence STRING,
access_mode STRING,
embargo_end_date STRING,
delayed BOOLEAN,
authors INT,
source STRING,
abstract BOOLEAN,
type STRING ,
peer_reviewed BOOLEAN,
green BOOLEAN,
gold BOOLEAN)
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.publication r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.dataset r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.software r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.otherresearchproduct r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result;
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
drop table ${stats_db_name}.result_tmp;
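Because `result_peerreviewed`, `result_greenoa` and `result_gold` were each built above to contain a row for every result (with the flag true or false), the LEFT OUTER JOINs always find a match and the boolean columns are never left NULL. A sketch of a sanity check one might run after the rebuild (not part of the workflow itself):

```sql
-- Illustrative only: per-type counts of the boolean flags in the rebuilt result table.
SELECT type,
       COUNT(*)                                       AS results,
       SUM(CASE WHEN peer_reviewed THEN 1 ELSE 0 END) AS peer_reviewed,
       SUM(CASE WHEN green THEN 1 ELSE 0 END)         AS green,
       SUM(CASE WHEN gold THEN 1 ELSE 0 END)          AS gold
FROM ${stats_db_name}.result
GROUP BY type;
```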

View File

@ -0,0 +1,163 @@
------------------------------------------------------
------------------------------------------------------
-- Shadow schema table exchange
------------------------------------------------------
------------------------------------------------------
-- Dropping old views
DROP VIEW IF EXISTS ${stats_db_shadow_name}.country;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.countrygdp;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_organizations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_results;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.fundref;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.numbers_country;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_projects;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_organizations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_results;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_organization;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_projects;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.rndexpediture;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.roarmap;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_topics;
-- Creating the shadow database, in case it doesn't exist
CREATE database IF NOT EXISTS ${stats_db_shadow_name};
-- Creating new views
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.country AS SELECT * FROM ${stats_db_name}.country;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization AS SELECT * FROM ${stats_db_name}.organization;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project AS SELECT * FROM ${stats_db_name}.project;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication AS SELECT * FROM ${stats_db_name}.publication;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result AS SELECT * FROM ${stats_db_name}.result;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software AS SELECT * FROM ${stats_db_name}.software;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics;
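The shadow database is only a layer of views over the freshly built stats database, so consumers point at `${stats_db_shadow_name}` and are switched to a new stats build simply by recreating these views. A trivial illustrative read through the shadow layer:

```sql
-- Illustrative only: downstream consumers query the shadow schema, not a concrete stats DB.
SELECT COUNT(*) AS publications FROM ${stats_db_shadow_name}.publication;
```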

View File

@ -0,0 +1,81 @@
------------------------------------------------------
------------------------------------------------------
-- Impala table statistics - Needed to make the tables
-- visible for impala
------------------------------------------------------
------------------------------------------------------
COMPUTE STATS country;
COMPUTE STATS countrygdp;
COMPUTE STATS dataset;
COMPUTE STATS dataset_citations;
COMPUTE STATS dataset_classifications;
COMPUTE STATS dataset_concepts;
COMPUTE STATS dataset_datasources;
COMPUTE STATS dataset_languages;
COMPUTE STATS dataset_oids;
COMPUTE STATS dataset_pids;
COMPUTE STATS dataset_sources;
COMPUTE STATS dataset_topics;
COMPUTE STATS datasource;
COMPUTE STATS datasource_languages;
COMPUTE STATS datasource_oids;
COMPUTE STATS datasource_organizations;
COMPUTE STATS datasource_results;
COMPUTE STATS fundref;
COMPUTE STATS numbers_country;
COMPUTE STATS organization;
COMPUTE STATS organization_datasources;
COMPUTE STATS organization_projects;
COMPUTE STATS otherresearchproduct;
COMPUTE STATS otherresearchproduct_citations;
COMPUTE STATS otherresearchproduct_classifications;
COMPUTE STATS otherresearchproduct_concepts;
COMPUTE STATS otherresearchproduct_datasources;
COMPUTE STATS otherresearchproduct_languages;
COMPUTE STATS otherresearchproduct_licenses;
COMPUTE STATS otherresearchproduct_oids;
COMPUTE STATS otherresearchproduct_pids;
COMPUTE STATS otherresearchproduct_sources;
COMPUTE STATS otherresearchproduct_topics;
COMPUTE STATS project;
COMPUTE STATS project_oids;
COMPUTE STATS project_organizations;
COMPUTE STATS project_results;
COMPUTE STATS publication;
COMPUTE STATS publication_citations;
COMPUTE STATS publication_classifications;
COMPUTE STATS publication_concepts;
COMPUTE STATS publication_datasources;
COMPUTE STATS publication_languages;
COMPUTE STATS publication_licenses;
COMPUTE STATS publication_oids;
COMPUTE STATS publication_pids;
COMPUTE STATS publication_sources;
COMPUTE STATS publication_topics;
COMPUTE STATS result;
COMPUTE STATS result_citations;
COMPUTE STATS result_classifications;
COMPUTE STATS result_concepts;
COMPUTE STATS result_datasources;
COMPUTE STATS result_languages;
COMPUTE STATS result_licenses;
COMPUTE STATS result_oids;
COMPUTE STATS result_organization;
COMPUTE STATS result_pids;
COMPUTE STATS result_projects;
COMPUTE STATS result_sources;
COMPUTE STATS result_topics;
COMPUTE STATS rndexpediture;
COMPUTE STATS roarmap;
COMPUTE STATS software;
COMPUTE STATS software_citations;
COMPUTE STATS software_classifications;
COMPUTE STATS software_concepts;
COMPUTE STATS software_datasources;
COMPUTE STATS software_languages;
COMPUTE STATS software_licenses;
COMPUTE STATS software_oids;
COMPUTE STATS software_pids;
COMPUTE STATS software_sources;
COMPUTE STATS software_topics;
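The COMPUTE STATS statements use unqualified table names, so they only resolve once the stats database is the current one; how the workflow selects it is not shown in this diff. A hedged sketch of one way to do that in Impala SQL:

```sql
-- Illustrative only: select the stats database first so the unqualified table names resolve.
USE ${stats_db_name};
COMPUTE STATS country;
-- ... remaining COMPUTE STATS statements as listed above ...
```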

View File

@ -0,0 +1,35 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- Publication table/view and Publication related tables/views
--------------------------------------------------------------
--------------------------------------------------------------
-- Publication temporary table
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp;
CREATE TABLE ${stats_db_name}.publication_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal ,
p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source,
case when size(p.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference=false;
CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;
CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context;
CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p;
CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids;
CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid;
CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject;
-- Publication_citations
CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
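The same decomposition (classifications, concepts, datasources, languages, oids, pids, topics, citations) is repeated below for dataset, software and otherresearchproduct. An illustrative query over one of these satellite tables, assuming the publication tables above have been created:

```sql
-- Illustrative only: how many publications carry each PID type (doi, handle, pmid, ...).
SELECT type, COUNT(DISTINCT id) AS publications
FROM ${stats_db_name}.publication_pids
GROUP BY type
ORDER BY publications DESC;
```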

View File

@ -1,10 +0,0 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- 2. Publication table/view and Publication related tables/views
--------------------------------------------------------------
--------------------------------------------------------------
-- Publication temporary table
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp;
CREATE TABLE ${stats_db_name}.publication_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');

View File

@ -1,19 +0,0 @@
-- The following throws the following exception on CRN HUE Hive:
-- Error while compiling statement: FAILED: SemanticException [Error 10011]: Line 2:34 Invalid function 'date_format'
-- But runs OK on OCEAN HUE Hive
INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal ,
p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source,
case when size(p.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference=false;
-- INSERT INTO ${hive_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal,
-- p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
-- p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source,
-- case when size(p.description) > 0 then true else false end as abstract,
-- 'publication' as type
-- from openaire.publication p
-- where p.datainfo.deletedbyinference=false;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.publication p;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids as ids;

View File

@ -1 +0,0 @@
create table ${stats_db_name}.publication_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.publication p lateral view explode(p.pid) pids as ppid;

View File

@ -1 +0,0 @@
create table ${stats_db_name}.publication_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.publication p lateral view explode(p.subject) subjects as subject;

View File

@ -1,2 +1,36 @@
-- 3. Publication_citations
CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
------------------------------------------------------
------------------------------------------------------
-- Dataset table/view and Dataset related tables/views
------------------------------------------------------
------------------------------------------------------
-- Dataset temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp;
CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored AS orc tblproperties('transactional'='true');
INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, d.title[0].value AS title, d.publisher.value AS publisher, cast(null AS string) AS journal,
d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') AS year, d.bestaccessright.classname AS bestlicence,
d.embargoenddate.value AS embargo_end_date, false AS delayed, size(d.author) AS authors , concat_ws('\u003B',d.source.value) AS source,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
'dataset' AS type
FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference=FALSE;
-- Dataset_citations
CREATE TABLE ${stats_db_name}.dataset_citations AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;
CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context;
CREATE TABLE ${stats_db_name}.dataset_datasources AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.dataset_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p;
CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids;
CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid;
CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject;

View File

@ -0,0 +1,36 @@
--------------------------------------------------------
--------------------------------------------------------
-- Software table/view and Software related tables/views
--------------------------------------------------------
--------------------------------------------------------
-- Software temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp;
CREATE TABLE ${stats_db_name}.software_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true');
INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, s.title[0].value AS title, s.publisher.value AS publisher, CAST(NULL AS string) AS journal,
s.dateofacceptance.value AS DATE, date_format(s.dateofacceptance.value,'yyyy') AS YEAR, s.bestaccessright.classname AS bestlicence,
s.embargoenddate.value AS embargo_end_date, FALSE AS delayed, SIZE(s.author) AS authors , concat_ws('\u003B',s.source.value) AS source,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'software' as type
from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference=false;
-- Software_citations
CREATE TABLE ${stats_db_name}.software_citations AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;
CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context;
CREATE TABLE ${stats_db_name}.software_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p;
CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids;
CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid;
CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject;

View File

@ -1,9 +0,0 @@
------------------------------------------------------
------------------------------------------------------
-- 4. Dataset table/view and Dataset related tables/views
------------------------------------------------------
------------------------------------------------------
-- Dataset temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp;
CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');

View File

@ -1 +0,0 @@
create table ${stats_db_name}.dataset_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.dataset p lateral view explode(p.subject) subjects as subject;

View File

@ -1,7 +0,0 @@
INSERT INTO ${stats_db_name}.dataset_tmp select substr(d.id, 4) as id, d.title[0].value as title, d.publisher.value as publisher, cast(null as string) as journal,
d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') as year, d.bestaccessright.classname as bestlicence,
d.embargoenddate.value as embargo_end_date, false as delayed, size(d.author) as authors , concat_ws('\u003B',d.source.value) as source,
case when size(d.description) > 0 then true else false end as abstract,
'dataset' as type
from ${openaire_db_name}.dataset d
where d.datainfo.deletedbyinference=false;

View File

@ -1,2 +0,0 @@
-- Dataset_citations
Create table ${stats_db_name}.dataset_citations as select substr(d.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") as result from ${openaire_db_name}.dataset d lateral view explode(d.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context;

View File

@ -1,3 +0,0 @@
CREATE TABLE ${stats_db_name}.dataset_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
from ${openaire_db_name}.dataset p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.dataset_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.dataset p;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids as ids;

View File

@ -1 +0,0 @@
create table ${stats_db_name}.dataset_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.dataset p lateral view explode(p.pid) pids as ppid;

View File

@ -0,0 +1,37 @@
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Otherresearchproduct table/view and Otherresearchproduct related tables/views
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Otherresearchproduct temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp;
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true');
INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, o.title[0].value AS title, o.publisher.value AS publisher, CAST(NULL AS string) AS journal,
o.dateofacceptance.value AS DATE, date_format(o.dateofacceptance.value,'yyyy') AS year, o.bestaccessright.classname AS bestlicence,
o.embargoenddate.value as embargo_end_date, FALSE AS delayed, SIZE(o.author) AS authors , concat_ws('\u003B',o.source.value) AS source,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'other' AS type
FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference=FALSE;
-- Otherresearchproduct_citations
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context;
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p;
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids;
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid;
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject;

View File

@ -1,9 +0,0 @@
--------------------------------------------------------
--------------------------------------------------------
-- 5. Software table/view and Software related tables/views
--------------------------------------------------------
--------------------------------------------------------
-- Software temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp;
CREATE TABLE ${stats_db_name}.software_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');

View File

@ -1 +0,0 @@
create table ${stats_db_name}.software_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.software p lateral view explode(p.subject) subjects as subject;

View File

@ -1,7 +0,0 @@
INSERT INTO ${stats_db_name}.software_tmp select substr(s.id, 4) as id, s.title[0].value as title, s.publisher.value as publisher, cast(null as string) as journal,
s.dateofacceptance.value as date, date_format(s.dateofacceptance.value,'yyyy') as year, s.bestaccessright.classname as bestlicence,
s.embargoenddate.value as embargo_end_date, false as delayed, size(s.author) as authors , concat_ws('\u003B',s.source.value) as source,
case when size(s.description) > 0 then true else false end as abstract,
'software' as type
from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference=false;

View File

@ -1,2 +0,0 @@
-- Software_citations
Create table ${stats_db_name}.software_citations as select substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") as result from ${openaire_db_name}.software s lateral view explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts as context;

View File

@ -1,3 +0,0 @@
CREATE TABLE ${stats_db_name}.software_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
from ${openaire_db_name}.software p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.software p;

View File

@ -1 +0,0 @@
CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids as ids;

View File

@ -1 +0,0 @@
create table ${stats_db_name}.software_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.software p lateral view explode(p.pid) pids as ppid;

View File

@ -0,0 +1,30 @@
-- noinspection SqlNoDataSourceInspectionForFile
------------------------------------------------------
------------------------------------------------------
-- Project table/view and Project related tables/views
------------------------------------------------------
------------------------------------------------------
-- Project_oids Table
DROP TABLE IF EXISTS ${stats_db_name}.project_oids;
CREATE TABLE ${stats_db_name}.project_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids;
-- Project_organizations Table
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations;
CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization';
-- Project_results Table
DROP TABLE IF EXISTS ${stats_db_name}.project_results;
CREATE TABLE ${stats_db_name}.project_results AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result FROM ${openaire_db_name}.relation r WHERE r.reltype='resultProject' and r.datainfo.deletedbyinference=false;
-- Project table
----------------
-- Creating and populating temporary Project table
DROP TABLE IF EXISTS ${stats_db_name}.project_tmp;
CREATE TABLE ${stats_db_name}.project_tmp (id STRING, acronym STRING, title STRING, funder STRING, funding_lvl0 STRING, funding_lvl1 STRING, funding_lvl2 STRING, ec39 STRING, type STRING, startdate STRING, enddate STRING, start_year INT, end_year INT, duration INT, haspubs STRING, numpubs INT, daysforlastpub INT, delayedpubs INT, callidentifier STRING, code STRING) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true');
INSERT INTO ${stats_db_name}.project_tmp SELECT substr(p.id, 4) AS id, p.acronym.value AS acronym, p.title.value AS title, xpath_string(p.fundingtree[0].value, '//funder/name') AS funder, xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0, xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1, xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2, p.ecsc39.value AS ec39, p.contracttype.classname AS type, p.startdate.value AS startdate, p.enddate.value AS enddate, year(p.startdate.value) AS start_year, year(p.enddate.value) AS end_year, CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration, 'no' AS haspubs, 0 AS numpubs, 0 AS daysforlastpub, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, p.code.value AS code FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference=false;
create table ${stats_db_name}.funder as
select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund;
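The `project_tmp` duration is computed as whole months between start and end date, and the `funder` table is extracted from the project funding-tree XML. An illustrative aggregation over the temporary project table, for orientation only:

```sql
-- Illustrative only: projects per funder with their average duration in months.
SELECT funder, COUNT(*) AS projects, AVG(duration) AS avg_duration_months
FROM ${stats_db_name}.project_tmp
GROUP BY funder;
```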

View File

@ -1,9 +0,0 @@
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- 6. Otherresearchproduct table/view and Otherresearchproduct related tables/views
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Otherresearchproduct temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp;
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');

Some files were not shown because too many files have changed in this diff.