sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for an individual executor
sparkExecutorCores
number of cores used by a single executor
crossrefdumpfilename
the file name of the Crossref dump (tar.gz) to download
crossrefDumpPath
the Crossref dump path
crossrefdumptoken
the token for the API dump path
${jobTracker}
${nameNode}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
${jobTracker}
${nameNode}
mapred.job.queue.name
${queueName}
download.sh
${url}
${crossrefDumpPath}
${crossrefdumpfilename}
${crossrefdumptoken}
HADOOP_USER_NAME=${wf:user()}
download.sh
${jobTracker}
${nameNode}
eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords
--hdfsServerUri${nameNode}
--crossrefFileNameTarGz${crossrefdumpfilename}
--workingPath${crossrefDumpPath}
--outputPath${crossrefDumpPath}/files/
yarn-cluster
cluster
SparkUnpackCrossrefEntries
eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--masteryarn-cluster
--sourcePath${crossrefDumpPath}/files
--targetPath${crossrefDumpPath}/crossref_unpack/