sourcePath
the source path
outputPath
the output path
resultAggregation
true if all the result types have to be dumped under result, false otherwise
organizationCommunityMap
the organization community map
hiveDbName
the target hive database name
hiveJdbcUrl
hive server jdbc url
hiveMetastoreUris
hive server metastore URIs
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
yarn
cluster
Dump table publication
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/publication
--resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
--outputPath${workingDir}/result/publication
--communityMapPath${communityMapPath}
yarn
cluster
Dump table dataset
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/dataset
--resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
--outputPath${workingDir}/result/dataset
--communityMapPath${communityMapPath}
yarn
cluster
Dump table ORP
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/otherresearchproduct
--resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
--outputPath${workingDir}/result/otherresearchproduct
--communityMapPath${communityMapPath}
yarn
cluster
Dump table software
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/software
--resultTableNameeu.dnetlib.dhp.schema.oaf.Software
--outputPath${workingDir}/result/software
--communityMapPath${communityMapPath}
yarn
cluster
Dump table organization
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/organization
--resultTableNameeu.dnetlib.dhp.schema.oaf.Organization
--outputPath${outputPath}/organization
--communityMapPath${communityMapPath}
yarn
cluster
Dump table project
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/project
--resultTableNameeu.dnetlib.dhp.schema.oaf.Project
--outputPath${outputPath}/project
--communityMapPath${communityMapPath}
yarn
cluster
Dump table datasource
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/datasource
--resultTableNameeu.dnetlib.dhp.schema.oaf.Datasource
--outputPath${outputPath}/datasource
--communityMapPath${workingDir}/communityMap
yarn
cluster
Select valid table relation
eu.dnetlib.dhp.oa.graph.dump.complete.SparkSelectValidRelationsJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}
--outputPath${workingDir}/validrelation
yarn
cluster
Dump table relation
eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpRelationJob
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${workingDir}/validrelation
--outputPath${workingDir}/relation/relation
--removeSet${removeSet}
eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextEntities
--hdfsPath${outputPath}/communities_infrastructures/communities_infrastructure.json.gz
--nameNode${nameNode}
--isLookUpUrl${isLookUpUrl}
eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextRelation
--hdfsPath${workingDir}/relation/context
--nameNode${nameNode}
--isLookUpUrl${isLookUpUrl}
yarn
cluster
Dump organization context relations
eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/relation
--outputPath${workingDir}/relation/contextOrg
--organizationCommunityMap${organizationCommunityMap}
--communityMapPath${communityMapPath}
yarn
cluster
Extract Relations from publication
eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/publication
--resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
--outputPath${workingDir}/relation/publication
--communityMapPath${communityMapPath}
yarn
cluster
Extract Relations from dataset
eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/dataset
--resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
--outputPath${workingDir}/relation/dataset
--communityMapPath${communityMapPath}
yarn
cluster
Extract Relations from ORP
eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/otherresearchproduct
--resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
--outputPath${workingDir}/relation/orp
--communityMapPath${communityMapPath}
yarn
cluster
Extract Relations from software
eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/software
--resultTableNameeu.dnetlib.dhp.schema.oaf.Software
--outputPath${workingDir}/relation/software
--communityMapPath${communityMapPath}
yarn
cluster
Collect Results and Relations and put them in the right path
eu.dnetlib.dhp.oa.graph.dump.complete.SparkCollectAndSave
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${workingDir}
--outputPath${outputPath}
--resultAggregation${resultAggregation}
Sub-workflow dump complete failed with error message ${wf:errorMessage()}