diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
index e98cbbc73..0d5121cf1 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@@ -97,7 +97,18 @@
-    <start to="prepare_relations"/>
+    <start to="resume_from"/>
+
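+    <!-- resumeFrom selects the phase to (re)start from; unmatched values fall through to the default -->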
+    <decision name="resume_from">
+        <switch>
+            <case to="prepare_relations">${wf:conf('resumeFrom') eq 'prepare_relations'}</case>
+            <case to="fork_join_related_entities">${wf:conf('resumeFrom') eq 'fork_join_related_entities'}</case>
+            <case to="fork_join_all_entities">${wf:conf('resumeFrom') eq 'fork_join_all_entities'}</case>
+            <case to="convert_to_xml">${wf:conf('resumeFrom') eq 'convert_to_xml'}</case>
+            <case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case>
+            <default to="prepare_relations"/>
+        </switch>
+    </decision>
+
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@@ -124,12 +135,475 @@
             <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
             <arg>--relPartitions</arg><arg>5000</arg>
+        </spark>
+        <ok to="fork_join_related_entities"/>
+        <error to="Kill"/>
+    </action>
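+
+    <!-- Phase 1 (CreateRelatedEntitiesJob_phase1): join the prepared relations with each entity
+         type on relation.target, writing one related-entity view per type under join_partial/ -->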
+    <fork name="fork_join_related_entities">
+        <path start="join_relation_publication"/>
+        <path start="join_relation_dataset"/>
+        <path start="join_relation_otherresearchproduct"/>
+        <path start="join_relation_software"/>
+        <path start="join_relation_datasource"/>
+        <path start="join_relation_organization"/>
+        <path start="join_relation_project"/>
+    </fork>
+
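+    <!-- spark.sql.shuffle.partitions is sized per entity type: 7680 for publications, 3840 for the smaller tables -->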
+    <action name="join_relation_publication">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = publication.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/publication</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/publication</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_relation_dataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = dataset.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/dataset</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/dataset</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_relation_otherresearchproduct">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = otherresearchproduct.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/otherresearchproduct</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/otherresearchproduct</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_relation_software">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = software.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/software</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/software</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_relation_datasource">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = datasource.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/datasource</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/datasource</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_relation_organization">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = organization.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/organization</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/organization</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_relation_project">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = project.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/project</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/project</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
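+    <!-- Phase 2 (CreateRelatedEntitiesJob_phase2): after all phase-1 joins complete, each entity is
+         joined on entity.id = relatedEntity.source against the combined join_partial output -->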
+    <join name="wait_joins" to="fork_join_all_entities"/>
+
+    <fork name="fork_join_all_entities">
+        <path start="join_publication_relations"/>
+        <path start="join_dataset_relations"/>
+        <path start="join_otherresearchproduct_relations"/>
+        <path start="join_software_relations"/>
+        <path start="join_datasource_relations"/>
+        <path start="join_organization_relations"/>
+        <path start="join_project_relations"/>
+    </fork>
+
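+    <!-- numPartitions follows table size: from 30000 (publication) down to 1000 (datasource) -->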
+    <action name="join_publication_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[publication.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=15360
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/publication</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/publication</arg>
+            <arg>--numPartitions</arg><arg>30000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_dataset_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[dataset.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/dataset</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/dataset</arg>
+            <arg>--numPartitions</arg><arg>20000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_otherresearchproduct_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[otherresearchproduct.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/otherresearchproduct</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/otherresearchproduct</arg>
+            <arg>--numPartitions</arg><arg>10000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_software_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[software.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/software</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/software</arg>
+            <arg>--numPartitions</arg><arg>10000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_datasource_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[datasource.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/datasource</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/datasource</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_organization_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[organization.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/organization</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/organization</arg>
+            <arg>--numPartitions</arg><arg>20000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_project_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[project.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/project</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/project</arg>
+            <arg>--numPartitions</arg><arg>10000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
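+    <!-- barrier: every phase-2 join must finish before the XML records are built -->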
+    <join name="wait_join_phase2" to="convert_to_xml"/>
+
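+    <!-- Phase 3 (XmlConverterJob): assemble each entity plus its related entities into a single XML record -->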
+    <action name="convert_to_xml">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>convert_to_xml</name>
+            <class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/xml</arg>
+            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--otherDsTypeId</arg><arg>${otherDsTypeId}</arg>
+        </spark>
+        <ok to="to_solr_index"/>
+        <error to="Kill"/>
+    </action>
+
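+    <!-- Phase 4 (XmlIndexingJob): submit the XML records to Solr; dynamic allocation caps the executor
+         count, and speculative execution is disabled so duplicate task attempts cannot index records twice -->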
+    <action name="to_solr_index">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>to_solr_index</name>
+            <class>eu.dnetlib.dhp.oa.provision.XmlIndexingJob</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemoryForIndexing}
+                --driver-memory=${sparkDriverMemoryForIndexing}
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.speculation=false
+                --conf spark.hadoop.mapreduce.map.speculative=false
+                --conf spark.hadoop.mapreduce.reduce.speculative=false
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingDir}/xml</arg>
+            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--format</arg><arg>${format}</arg>
+            <arg>--batchSize</arg><arg>${batchSize}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+
+</workflow-app>
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file