inputGraphRootPath
root location of input materialized graph
isLookupUrl
URL for the isLookup service
contextApiBaseUrl
context API URL
relPartitions
number of partitions for the relations Dataset
relationFilter
filter applied reading relations (by relClass)
sourceMaxRelations
maximum number of relations allowed for each entity, grouping by source
targetMaxRelations
maximum number of relations allowed for each entity, grouping by target
format
metadata format name (DMF|TMF)
batchSize
number of records to be included in each indexing request
solrDeletionQuery
*:*
query used in the delete-by-query operation
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
sparkDriverMemoryForJoining
memory for driver process
sparkExecutorMemoryForJoining
memory for individual executor
sparkExecutorCoresForJoining
number of cores used by single executor
sparkDriverMemoryForIndexing
memory for driver process
sparkExecutorMemoryForIndexing
memory for individual executor
sparkExecutorCoresForIndexing
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
sparkNetworkTimeout
configures spark.network.timeout
${jobTracker}
${nameNode}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
${wf:conf('resumeFrom') eq 'prepare_relations'}
${wf:conf('resumeFrom') eq 'fork_join_related_entities'}
${wf:conf('resumeFrom') eq 'fork_join_all_entities'}
${wf:conf('resumeFrom') eq 'convert_to_xml'}
${wf:conf('resumeFrom') eq 'drop_solr_collection'}
${wf:conf('resumeFrom') eq 'to_solr_index'}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn
cluster
PrepareRelations
eu.dnetlib.dhp.oa.provision.PrepareRelationsJob
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--inputRelationsPath${inputGraphRootPath}/relation
--outputPath${workingDir}/relation
--sourceMaxRelations${sourceMaxRelations}
--targetMaxRelations${targetMaxRelations}
--relationFilter${relationFilter}
--relPartitions5000
yarn
cluster
Join[relation.target = publication.id]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputRelationsPath${workingDir}/relation
--inputEntityPath${inputGraphRootPath}/publication
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication
--outputPath${workingDir}/join_partial/publication
yarn
cluster
Join[relation.target = dataset.id]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputRelationsPath${workingDir}/relation
--inputEntityPath${inputGraphRootPath}/dataset
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset
--outputPath${workingDir}/join_partial/dataset
yarn
cluster
Join[relation.target = otherresearchproduct.id]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputRelationsPath${workingDir}/relation
--inputEntityPath${inputGraphRootPath}/otherresearchproduct
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
--outputPath${workingDir}/join_partial/otherresearchproduct
yarn
cluster
Join[relation.target = software.id]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputRelationsPath${workingDir}/relation
--inputEntityPath${inputGraphRootPath}/software
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software
--outputPath${workingDir}/join_partial/software
yarn
cluster
Join[relation.target = datasource.id]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputRelationsPath${workingDir}/relation
--inputEntityPath${inputGraphRootPath}/datasource
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource
--outputPath${workingDir}/join_partial/datasource
yarn
cluster
Join[relation.target = organization.id]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputRelationsPath${workingDir}/relation
--inputEntityPath${inputGraphRootPath}/organization
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization
--outputPath${workingDir}/join_partial/organization
yarn
cluster
Join[relation.target = project.id]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputRelationsPath${workingDir}/relation
--inputEntityPath${inputGraphRootPath}/project
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project
--outputPath${workingDir}/join_partial/project
yarn
cluster
Join[publication.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputEntityPath${inputGraphRootPath}/publication
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication
--inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities/publication
--numPartitions30000
yarn
cluster
Join[dataset.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputEntityPath${inputGraphRootPath}/dataset
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset
--inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities/dataset
--numPartitions20000
yarn
cluster
Join[otherresearchproduct.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputEntityPath${inputGraphRootPath}/otherresearchproduct
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
--inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities/otherresearchproduct
--numPartitions10000
yarn
cluster
Join[software.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputEntityPath${inputGraphRootPath}/software
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software
--inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities/software
--numPartitions10000
yarn
cluster
Join[datasource.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputEntityPath${inputGraphRootPath}/datasource
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource
--inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities/datasource
--numPartitions1000
yarn
cluster
Join[organization.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputEntityPath${inputGraphRootPath}/organization
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization
--inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities/organization
--numPartitions20000
yarn
cluster
Join[project.id = relatedEntity.source]
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputEntityPath${inputGraphRootPath}/project
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project
--inputRelatedEntitiesPath${workingDir}/join_partial
--outputPath${workingDir}/join_entities/project
--numPartitions10000
yarn
cluster
convert_to_xml
eu.dnetlib.dhp.oa.provision.XmlConverterJob
dhp-graph-provision-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
--inputPath/user/claudio.atzori/data/provision/join_entities
--outputPath/user/claudio.atzori/data/provision/xml_json_test
--contextApiBaseUrl${contextApiBaseUrl}
--isLookupUrl${isLookupUrl}
oozie.launcher.mapreduce.user.classpath.first
true
eu.dnetlib.dhp.oa.provision.SolrAdminApplication
--isLookupUrl${isLookupUrl}
--format${format}
--actionDELETE_BY_QUERY
--query${solrDeletionQuery}
--committrue
--shouldIndex${shouldIndex}
yarn
cluster
to_solr_index
eu.dnetlib.dhp.oa.provision.XmlIndexingJob
dhp-graph-provision-${projectVersion}.jar
--executor-memory=${sparkExecutorMemoryForIndexing}
--driver-memory=${sparkDriverMemoryForIndexing}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
--inputPath/user/claudio.atzori/data/provision/xml_json_test
--isLookupUrl${isLookupUrl}
--format${format}
--batchSize${batchSize}
--outputFormat${outputFormat}
--outputPath/user/claudio.atzori/data/provision/solr_documents
--shouldIndex${shouldIndex}
oozie.launcher.mapreduce.user.classpath.first
true
eu.dnetlib.dhp.oa.provision.SolrAdminApplication
--isLookupUrl${isLookupUrl}
--format${format}
--actionCOMMIT
--shouldIndex${shouldIndex}