workingPath /data/orcid_activities_2020 path where the collection workflow stores the ORCID data outputPath /data/orcid_activities_2020/no_doi_dataset_prod/ path where to store the action set processOutputFolder process_no_doi_dataset_prod temporary path where to store the action set spark2GenNoDoiDatasetMaxExecutors 40 sparkDriverMemory memory for driver process spark2GenNoDoiDatasetExecutorMemory 2G memory for individual executor oozieActionShareLibForSpark2 oozie action sharelib for spark 2.* spark2ExtraListeners com.cloudera.spark.lineage.NavigatorAppListener spark 2.* extra listeners classname spark2SqlQueryExecutionListeners com.cloudera.spark.lineage.NavigatorQueryListener spark 2.* sql query execution listeners classname spark2YarnHistoryServerAddress spark 2.* yarn history server address spark2EventLogDir spark 2.* event log dir location

${jobTracker}

${nameNode}

oozie.action.sharelib.for.spark ${oozieActionShareLibForSpark2} Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] yarn-cluster cluster GenOrcidNoDoiDataset eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks dhp-doiboost-${projectVersion}.jar

--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2GenNoDoiDatasetMaxExecutors} --executor-memory=${spark2GenNoDoiDatasetExecutorMemory} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}

--workingPath${workingPath}/ --hdfsServerUri${nameNode} --orcidDataFolderlast_orcid_dataset --outputEnrichedWorksPath${processOutputFolder} ${workingPath}/${processOutputFolder}/* ${outputPath}