diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
index 6c7f1efd7..85a9113f2 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
@@ -119,7 +119,7 @@ public class CreateRelatedEntitiesJob_phase2 {
return re;
}, Encoders.bean(EntityRelEntity.class))
.write()
- .mode(SaveMode.Append)
+ .mode(SaveMode.Overwrite)
.parquet(outputPath);
}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
index 516821509..33b9291c4 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@@ -98,10 +98,20 @@
--inputRelationsPath${inputGraphRootPath}/relation
--outputPath${workingDir}/relation
-
+
+
+
+
+
+
+
+
+
+
+
yarn
@@ -124,7 +134,7 @@
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication
--outputPath${workingDir}/join_partial
-
+
@@ -150,15 +160,147 @@
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset
--outputPath${workingDir}/join_partial
-
+
+
+
+ yarn
+ cluster
+ Join[relation.target = otherresearchproduct.id]
+ eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
+ dhp-graph-provision-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCoresForJoining}
+ --executor-memory=${sparkExecutorMemoryForJoining}
+ --driver-memory=${sparkDriverMemoryForJoining}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+                             --inputRelationsPath${workingDir}/relation
+ --inputEntityPath${inputGraphRootPath}/otherresearchproduct
+ --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${workingDir}/join_partial
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Join[relation.target = software.id]
+ eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
+ dhp-graph-provision-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCoresForJoining}
+ --executor-memory=${sparkExecutorMemoryForJoining}
+ --driver-memory=${sparkDriverMemoryForJoining}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+                             --inputRelationsPath${workingDir}/relation
+ --inputEntityPath${inputGraphRootPath}/software
+ --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${workingDir}/join_partial
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Join[relation.target = datasource.id]
+ eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
+ dhp-graph-provision-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCoresForJoining}
+ --executor-memory=${sparkExecutorMemoryForJoining}
+ --driver-memory=${sparkDriverMemoryForJoining}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+                             --inputRelationsPath${workingDir}/relation
+ --inputEntityPath${inputGraphRootPath}/datasource
+ --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource
+ --outputPath${workingDir}/join_partial
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Join[relation.target = organization.id]
+ eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
+ dhp-graph-provision-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCoresForJoining}
+ --executor-memory=${sparkExecutorMemoryForJoining}
+ --driver-memory=${sparkDriverMemoryForJoining}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+                             --inputRelationsPath${workingDir}/relation
+ --inputEntityPath${inputGraphRootPath}/organization
+ --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization
+ --outputPath${workingDir}/join_partial
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Join[relation.target = project.id]
+ eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
+ dhp-graph-provision-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCoresForJoining}
+ --executor-memory=${sparkExecutorMemoryForJoining}
+ --driver-memory=${sparkDriverMemoryForJoining}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+                             --inputRelationsPath${workingDir}/relation
+ --inputEntityPath${inputGraphRootPath}/project
+ --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project
+ --outputPath${workingDir}/join_partial
+
+
+
+
+
+
+
yarn
cluster
- Join[relation.target = dataset.id]
+ Join[entities.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
@@ -171,8 +313,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
- --inputRelatedEntitiesPath${workingDir}/join_partial
--inputEntityPath${inputGraphRootPath}
+ --inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities
@@ -196,7 +338,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
- --inputPath ${${workingDir}/join_entities
+ --inputPath ${workingDir}/join_entities
--outputPath${workingDir}/joined
@@ -220,7 +362,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
- --inputPath${${workingDir}/joined
+ --inputPath${workingDir}/joined
--outputPath${workingDir}/xml
--isLookupUrl${isLookupUrl}
--otherDsTypeId${otherDsTypeId}
@@ -246,8 +388,8 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --isLookupUrl ${isLookupUrl}
--inputPath${workingDir}/xml
+ --isLookupUrl ${isLookupUrl}
--format${format}
--batchSize${batchSize}