sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorIntersectionMemory
memory for individual executor in the intersection step
sparkExecutorCores
number of cores used by a single executor
inputPathCrossref
the Crossref input path
crossrefTimestamp
timestamp for the Crossref incremental harvesting
esServer
Elasticsearch server URL for the Crossref harvesting
esIndex
Elasticsearch index name for the Crossref harvesting
MAGDumpPath
the MAG dump working path
inputPathMAG
the MAG working path
inputPathOrcid
the ORCID input path
workingPathOrcid
the ORCID working path
${jobTracker}
${nameNode}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
eu.dnetlib.doiboost.crossref.CrossrefImporter
--targetPath${inputPathCrossref}/index_update
--namenode${nameNode}
--esServer${esServer}
--esIndex${esIndex}
--timestamp${crossrefTimestamp}
yarn-cluster
cluster
GenerateCrossrefDataset
eu.dnetlib.doiboost.crossref.CrossrefDataset
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--workingPath${inputPathCrossref}
--masteryarn-cluster
yarn-cluster
cluster
Convert Mag to Dataset
eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${MAGDumpPath}
--targetPath${inputPathMAG}/dataset
--masteryarn-cluster
yarn-cluster
cluster
Convert ORCID to Dataset
eu.dnetlib.doiboost.orcid.SparkPreprocessORCID
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${inputPathOrcid}
--workingPath${workingPathOrcid}
--masteryarn-cluster