graphInputPath
the path where the graph is stored
workingPath
the path where the generated data will be stored
datasourceIdWhitelist
-
a white list (comma separated, - for empty list) of datasource ids
datasourceTypeWhitelist
-
a white list (comma separated, - for empty list) of datasource types
datasourceIdBlacklist
-
a black list (comma separated, - for empty list) of datasource ids
esEventIndexName
the elasticsearch index name
esIndexHost
the elasticsearch host
maxIndexedEventsForDsAndTopic
the max number of events for each pair (ds/topic)
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn
cluster
PrepareSimpleEntititiesJob
eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
PrepareRelatedDatasourcesJob
eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasourcesJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
PrepareRelatedDatasetsJob
eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
PrepareRelatedProjectsJob
eu.dnetlib.dhp.broker.oa.PrepareRelatedProjectsJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
PrepareRelatedPublicationsJob
eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
PrepareRelatedSoftwaresJob
eu.dnetlib.dhp.broker.oa.PrepareRelatedSoftwaresJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
JoinStep0
eu.dnetlib.dhp.broker.oa.JoinStep0Job
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
JoinStep1
eu.dnetlib.dhp.broker.oa.JoinStep1Job
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
JoinStep2
eu.dnetlib.dhp.broker.oa.JoinStep2Job
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
JoinStep3
eu.dnetlib.dhp.broker.oa.JoinStep3Job
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
JoinStep4
eu.dnetlib.dhp.broker.oa.JoinStep4Job
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
PrepareGroupsJob
eu.dnetlib.dhp.broker.oa.PrepareGroupsJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}
yarn
cluster
GenerateEventsJob
eu.dnetlib.dhp.broker.oa.GenerateEventsJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--workingPath${workingPath}
--datasourceIdWhitelist${datasourceIdWhitelist}
--datasourceTypeWhitelist${datasourceTypeWhitelist}
--datasourceIdBlacklist${datasourceIdBlacklist}
yarn
cluster
IndexEventSubsetOnESJob
eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob
dhp-broker-events-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--workingPath${workingPath}
--index${esEventIndexName}
--esHost${esIndexHost}
--maxEventsForTopic${maxIndexedEventsForDsAndTopic}
yarn
cluster
GenerateStatsJob
eu.dnetlib.dhp.broker.oa.GenerateStatsJob
dhp-broker-events-${projectVersion}.jar
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphPath${graphInputPath}
--workingPath${workingPath}