Workflow parameters:
  spark2UpdateStepMaxExecutors                    50
  workingPath                                     the working dir base path
  oozie.action.sharelib.for.java                  spark2
  oozie.launcher.mapreduce.user.classpath.first   true
  oozie.launcher.mapreduce.map.java.opts          -Xmx4g
  token                                           access token
  sparkDriverMemory                               7G   memory for driver process
  sparkExecutorMemory                             2G   memory for individual executor
  sparkExecutorCores                              1    number of cores used by single executor
  spark2DownloadingMaxExecutors                   10
  oozieActionShareLibForSpark2                    oozie action sharelib for spark 2.*
  spark2ExtraListeners                            com.cloudera.spark.lineage.NavigatorAppListener    spark 2.* extra listeners classname
  spark2SqlQueryExecutionListeners                com.cloudera.spark.lineage.NavigatorQueryListener  spark 2.* sql query execution listeners classname
  spark2YarnHistoryServerAddress                  spark 2.* yarn history server address
  spark2EventLogDir                               spark 2.* event log dir location

Global configuration:
  job-tracker: ${jobTracker}
  name-node:   ${nameNode}
  oozie.action.sharelib.for.spark = ${oozieActionShareLibForSpark2}

Kill node message:
  Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]

Shell action: get_orcid_lambda_file.sh
  job-tracker: ${jobTracker}
  name-node:   ${nameNode}
  exec: get_orcid_lambda_file.sh
  file: get_orcid_lambda_file.sh

Spark action: GenLastModifiedSeq
  master: yarn-cluster   mode: cluster
  class:  eu.dnetlib.doiboost.orcid.SparkGenLastModifiedSeq
  jar:    dhp-doiboost-${projectVersion}.jar
  spark-opts:
    --executor-memory=${sparkExecutorMemory}
    --executor-cores=${sparkExecutorCores}
    --driver-memory=${sparkDriverMemory}
    --conf spark.extraListeners=${spark2ExtraListeners}
    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  arguments:
    -w${workingPath}/  -n${nameNode}  -flast_modified.csv.tar  -olast_modified.seq  -t-

Spark action: DownloadOrcidAuthors
  master: yarn-cluster   mode: cluster
  class:  eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors
  jar:    dhp-doiboost-${projectVersion}.jar
  spark-opts:
    --conf spark.dynamicAllocation.enabled=true
    --conf spark.dynamicAllocation.maxExecutors=${spark2DownloadingMaxExecutors}
    --executor-memory=${sparkExecutorMemory}
    --driver-memory=${sparkDriverMemory}
    --conf spark.extraListeners=${spark2ExtraListeners}
    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  arguments:
    -w${workingPath}/  -n${nameNode}  -flast_modified.seq  -odownloads/updated_authors  -t${token}

Spark action: DownloadOrcidWorks
  master: yarn-cluster   mode: cluster
  class:  eu.dnetlib.doiboost.orcid.SparkDownloadOrcidWorks
  jar:    dhp-doiboost-${projectVersion}.jar
  spark-opts:
    --conf spark.dynamicAllocation.enabled=true
    --conf spark.dynamicAllocation.maxExecutors=${spark2DownloadingMaxExecutors}
    --executor-memory=${sparkExecutorMemory}
    --driver-memory=${sparkDriverMemory}
    --conf spark.extraListeners=${spark2ExtraListeners}
    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  arguments:
    -w${workingPath}/  -n${nameNode}  -f-  -odownloads/updated_works  -t${token}

Spark action: UpdateOrcidAuthors
  master: yarn-cluster   mode: cluster
  class:  eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors
  jar:    dhp-doiboost-${projectVersion}.jar
  spark-opts:
    --conf spark.dynamicAllocation.enabled=true
    --conf spark.dynamicAllocation.maxExecutors=${spark2UpdateStepMaxExecutors}
    --executor-memory=${sparkExecutorMemory}
    --driver-memory=${sparkDriverMemory}
    --conf spark.extraListeners=${spark2ExtraListeners}
    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  arguments:
    -w${workingPath}/  -n${nameNode}  -f-  -o-  -t-
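For orientation, the following is a minimal sketch of how a step such as UpdateOrcidAuthors could be written as an Oozie spark action, assuming the standard uri:oozie:spark-action schema. The action name, the kill-node name "Kill", and the transition to UpdateOrcidWorks are illustrative; the argument grouping mirrors the flattened listing above, and the original workflow may split each flag and value into separate <arg> elements.

  <action name="UpdateOrcidAuthors">
      <spark xmlns="uri:oozie:spark-action:0.2">
          <master>yarn-cluster</master>
          <mode>cluster</mode>
          <name>UpdateOrcidAuthors</name>
          <class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors</class>
          <jar>dhp-doiboost-${projectVersion}.jar</jar>
          <spark-opts>
              --conf spark.dynamicAllocation.enabled=true
              --conf spark.dynamicAllocation.maxExecutors=${spark2UpdateStepMaxExecutors}
              --executor-memory=${sparkExecutorMemory}
              --driver-memory=${sparkDriverMemory}
              --conf spark.extraListeners=${spark2ExtraListeners}
              --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
              --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
              --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
          </spark-opts>
          <!-- arguments as they appear in the listing above -->
          <arg>-w${workingPath}/</arg>
          <arg>-n${nameNode}</arg>
          <arg>-f-</arg>
          <arg>-o-</arg>
          <arg>-t-</arg>
      </spark>
      <ok to="UpdateOrcidWorks"/>
      <error to="Kill"/>
  </action>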
Spark action: UpdateOrcidWorks
  master: yarn-cluster   mode: cluster
  class:  eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks
  jar:    dhp-doiboost-${projectVersion}.jar
  spark-opts:
    --conf spark.dynamicAllocation.enabled=true
    --conf spark.dynamicAllocation.maxExecutors=${spark2UpdateStepMaxExecutors}
    --executor-memory=${sparkExecutorMemory}
    --driver-memory=${sparkDriverMemory}
    --conf spark.extraListeners=${spark2ExtraListeners}
    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  arguments:
    -w${workingPath}/  -n${nameNode}  -f-  -o-  -t-

Dataset promotion (HDFS moves, source followed by target):
  ${workingPath}/orcid_dataset/new_authors/*   ${workingPath}/orcid_dataset/authors
  ${workingPath}/orcid_dataset/new_works/*     ${workingPath}/orcid_dataset/works
  ${workingPath}/orcid_dataset/authors/*       ${workingPath}/last_orcid_dataset/authors
  ${workingPath}/orcid_dataset/works/*         ${workingPath}/last_orcid_dataset/works
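One way the promotion step could be expressed is with Oozie's fs action, as sketched below under the assumption that each source/target pair above maps one-to-one onto a <move> element. The action name and the transition targets "End" and "Kill" are illustrative and not taken from the source; the original workflow may instead use separate fs or distcp actions per dataset.

  <action name="PromoteOrcidDatasets">
      <fs>
          <move source="${workingPath}/orcid_dataset/new_authors/*" target="${workingPath}/orcid_dataset/authors"/>
          <move source="${workingPath}/orcid_dataset/new_works/*"   target="${workingPath}/orcid_dataset/works"/>
          <move source="${workingPath}/orcid_dataset/authors/*"     target="${workingPath}/last_orcid_dataset/authors"/>
          <move source="${workingPath}/orcid_dataset/works/*"       target="${workingPath}/last_orcid_dataset/works"/>
      </fs>
      <ok to="End"/>
      <error to="Kill"/>
  </action>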