sourcePath
the source path
outputPath
the output path
communities
the communities whose products should be dumped
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by a single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities
--outputPath${outputPath}/community
--nameNode${nameNode}
--communities${communities}
yarn
cluster
select the ids of results connected to communities and dump the relations
eu.dnetlib.dhp.oa.graph.dump.csv.SparkSelectResultsAndDumpRelations
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}
--workingPath${outputPath}/workingDir
--outputPath${outputPath}
--communities${communities}
yarn
cluster
select results from publication
eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults
dump-${projectVersion}.jar
--executor-memory=9G
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.shuffle.partitions=3840
--sourcePath${sourcePath}
--resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
--workingPath${outputPath}/workingDir
--resultTypepublication
yarn
cluster
select results from dataset
eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}
--resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
--workingPath${outputPath}/workingDir
--resultTypedataset
yarn
cluster
select results from other
eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}
--resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
--workingPath${outputPath}/workingDir
--resultTypeotherresearchproduct
yarn
cluster
select results from software
eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}
--resultTableNameeu.dnetlib.dhp.schema.oaf.Software
--workingPath${outputPath}/workingDir
--resultTypesoftware
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn
cluster
Dump single results
eu.dnetlib.dhp.oa.graph.dump.csv.SparkMoveOnSigleDir
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--workingPath${outputPath}/workingDir
--outputPath${outputPath}
eu.dnetlib.dhp.oa.graph.dump.MakeTar
--hdfsPath${outputPath}
--nameNode${nameNode}
--sourcePath${workingDir}/tar
eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS
--hdfsPath${outputPath}
--nameNode${nameNode}
--accessToken${accessToken}
--connectionUrl${connectionUrl}
--metadata${metadata}
--conceptRecordId${conceptRecordId}
--depositionType${depositionType}
--depositionId${depositionId}