factored out entity grouping implementation, extended to consider results from delegated authorities rather than identical records from other sources

2022-01-19 12:24:52 +01:00 · 2022-01-19 12:24:52 +01:00 · 44a937f4ed
parent 42e8f76778
commit 44a937f4ed
9 changed files with 412 additions and 45 deletions
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DispatchEntitiesSparkJob.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DispatchEntitiesSparkJob.java
@ -1,11 +1,11 @@

-package eu.dnetlib.dhp.oa.dedup;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.util.Objects;
-import java.util.Optional;
+package eu.dnetlib.dhp.oa.merge;

+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
@ -17,12 +17,10 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.Objects;
+import java.util.Optional;

-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

 public class DispatchEntitiesSparkJob {

@ -38,7 +36,7 @@ public class DispatchEntitiesSparkJob {
 					.requireNonNull(
 						DispatchEntitiesSparkJob.class
 							.getResourceAsStream(
-								"/eu/dnetlib/dhp/oa/dedup/dispatch_entities_parameters.json")));
+								"/eu/dnetlib/dhp/oa/graph/group/dispatch_entities_parameters.json")));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);

--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java
@ -1,15 +1,17 @@

-package eu.dnetlib.dhp.oa.dedup;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Objects;
-import java.util.Optional;
-import java.util.stream.Collectors;
+package eu.dnetlib.dhp.oa.merge;

+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.jayway.jsonpath.Configuration;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.JsonPath;
+import com.jayway.jsonpath.Option;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
@ -20,21 +22,17 @@ import org.apache.spark.sql.*;
 import org.apache.spark.sql.expressions.Aggregator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.jayway.jsonpath.Configuration;
-import com.jayway.jsonpath.DocumentContext;
-import com.jayway.jsonpath.JsonPath;
-import com.jayway.jsonpath.Option;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;

+import java.io.IOException;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
+
 /**
 * Groups the graph content by entity identifier to ensure ID uniqueness
 */
@ -53,7 +51,7 @@ public class GroupEntitiesSparkJob {
 			.toString(
 				GroupEntitiesSparkJob.class
 					.getResourceAsStream(
-						"/eu/dnetlib/dhp/oa/dedup/group_graph_entities_parameters.json"));
+						"/eu/dnetlib/dhp/oa/graph/group/group_graph_entities_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
@ -47,6 +47,17 @@ public class OafMapperUtils {
 	}

 	public static Result mergeResults(Result left, Result right) {
+
+		final boolean leftFromDeletedAuthority = isFromDeletedAuthority(left);
+		final boolean rightFromDeletedAuthority = isFromDeletedAuthority(right);
+
+		if (leftFromDeletedAuthority && !rightFromDeletedAuthority) {
+			return left;
+		}
+		if (!leftFromDeletedAuthority && rightFromDeletedAuthority) {
+			return right;
+		}
+
 		if (new ResultTypeComparator().compare(left, right) < 0) {
 			left.mergeFrom(right);
 			return left;
@ -56,6 +67,13 @@ public class OafMapperUtils {
 		}
 	}

+	private static boolean isFromDeletedAuthority(Result r) {
+		return r.getInstance()
+				.stream()
+				.map(i -> i.getCollectedfrom().getKey())
+				.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId));
+	}
+
 	public static KeyValue keyValue(final String k, final String v) {
 		final KeyValue kv = new KeyValue();
 		kv.setKey(k);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml
@ -104,7 +104,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>group graph entities</name>
-            <class>eu.dnetlib.dhp.oa.dedup.GroupEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -138,7 +138,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dispatch publications</name>
-            <class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -163,7 +163,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dispatch project</name>
-            <class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -188,7 +188,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dispatch organization</name>
-            <class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -213,7 +213,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dispatch publication</name>
-            <class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -238,7 +238,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dispatch dataset</name>
-            <class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -263,7 +263,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dispatch software</name>
-            <class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -288,7 +288,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dispatch otherresearchproduct</name>
-            <class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/dispatch_entities_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/dispatch_entities_parameters.json
@ -0,0 +1,26 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "i",
+    "paramLongName": "inputPath",
+    "paramDescription": "the source path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "o",
+    "paramLongName": "outputPath",
+    "paramDescription": "path of the output graph",
+    "paramRequired": true
+  },
+  {
+    "paramName": "c",
+    "paramLongName": "graphTableClassName",
+    "paramDescription": "the graph entity class name",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/group_graph_entities_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/group_graph_entities_parameters.json
@ -0,0 +1,20 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "gin",
+    "paramLongName": "graphInputPath",
+    "paramDescription": "the graph root path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "out",
+    "paramLongName": "outputPath",
+    "paramDescription": "the output merged graph root path",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/config-default.xml
@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml
@ -0,0 +1,289 @@
+<workflow-app name="Group Graph Entities" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>graphBasePath</name>
+            <description>the input graph base path</description>
+        </property>
+        <property>
+            <name>workingPath</name>
+            <description>path of the working directory</description>
+        </property>
+        <property>
+            <name>graphOutputPath</name>
+            <description>path of the output graph</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="group_entities"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="group_entities">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>group graph entities</name>
+            <class>eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=15000
+            </spark-opts>
+            <arg>--graphInputPath</arg><arg>${graphBasePath}</arg>
+            <arg>--outputPath</arg><arg>${workingPath}/grouped_entities</arg>
+        </spark>
+        <ok to="fork_dispatch_entities"/>
+        <error to="Kill"/>
+    </action>
+
+    <fork name="fork_dispatch_entities">
+        <path start="dispatch_datasource"/>
+        <path start="dispatch_project"/>
+        <path start="dispatch_organization"/>
+        <path start="dispatch_publication"/>
+        <path start="dispatch_dataset"/>
+        <path start="dispatch_software"/>
+        <path start="dispatch_otherresearchproduct"/>
+    </fork>
+
+    <action name="dispatch_datasource">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dispatch publications</name>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingPath}/grouped_entities</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
+        </spark>
+        <ok to="wait_dispatch"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="dispatch_project">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dispatch project</name>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingPath}/grouped_entities</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
+        </spark>
+        <ok to="wait_dispatch"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="dispatch_organization">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dispatch organization</name>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingPath}/grouped_entities</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
+        </spark>
+        <ok to="wait_dispatch"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="dispatch_publication">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dispatch publication</name>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingPath}/grouped_entities</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+        </spark>
+        <ok to="wait_dispatch"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="dispatch_dataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dispatch dataset</name>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingPath}/grouped_entities</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+        </spark>
+        <ok to="wait_dispatch"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="dispatch_software">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dispatch software</name>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingPath}/grouped_entities</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+        </spark>
+        <ok to="wait_dispatch"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="dispatch_otherresearchproduct">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dispatch otherresearchproduct</name>
+            <class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingPath}/grouped_entities</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+        </spark>
+        <ok to="wait_dispatch"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name="wait_dispatch" to="End"/>
+
+    <end name="End"/>
+</workflow-app>
--- a/pom.xml
+++ b/pom.xml
@ -797,7 +797,7 @@
 		<mockito-core.version>3.3.3</mockito-core.version>
 		<mongodb.driver.version>3.4.2</mongodb.driver.version>
 		<vtd.version>[2.12,3.0)</vtd.version>
-		<dhp-schemas.version>[2.10.24]</dhp-schemas.version>
+		<dhp-schemas.version>[2.10.26-SNAPSHOT]</dhp-schemas.version>
 		<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
 		<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 		<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>