sparkDriverMemory
memory allocated to the Spark driver process
sparkExecutorMemory
memory allocated to each Spark executor
sparkExecutorCores
number of cores used by each Spark executor
inputPathCrossref
the Crossref input path
crossrefDumpPath
the Crossref dump path
MAGDumpPath
the path of the MAG dump, read as the source when converting MAG to a dataset
inputPathMAG
the MAG working path; the converted MAG dataset is written to its dataset subdirectory
inputPathOrcid
the ORCID input path
workingPathOrcid
the ORCID working path
${jobTracker}
${nameNode}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
${wf:conf('resumeFrom') eq 'UnpackCrossrefEntries'}
${wf:conf('resumeFrom') eq 'GenerateCrossrefDataset'}
${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}
${wf:conf('resumeFrom') eq 'ConvertMagToDataset'}
${wf:conf('resumeFrom') eq 'PreProcessORCID'}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
${jobTracker}
${nameNode}
eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords
--hdfsServerUri${nameNode}
--crossrefFileNameTarGz${crossrefDumpPath}/crossref.tar.gz
--workingPath${crossrefDumpPath}
--outputPath${crossrefDumpPath}/files/
yarn-cluster
cluster
SparkUnpackCrossrefEntries
eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--masteryarn-cluster
--sourcePath${crossrefDumpPath}/files
--targetPath${crossrefDumpPath}/crossref_unpack/
yarn-cluster
cluster
SparkGenerateCrossrefDataset
eu.dnetlib.doiboost.crossref.GenerateCrossrefDataset
dhp-doiboost-${projectVersion}.jar
--executor-memory=7G
--executor-cores=2
--driver-memory=7G
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--masteryarn-cluster
--sourcePath${crossrefDumpPath}/crossref_unpack/
--targetPath${inputPathCrossref}/crossref_ds
yarn-cluster
cluster
Convert Mag to Dataset
eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${MAGDumpPath}
--targetPath${inputPathMAG}/dataset
--masteryarn-cluster
yarn-cluster
cluster
Convert ORCID to Dataset
eu.dnetlib.doiboost.orcid.SparkPreprocessORCID
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${inputPathOrcid}
--workingPath${workingPathOrcid}
--masteryarn-cluster