sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorIntersectionMemory
memory for individual executor in the MAG conversion and DOIBoost creation steps
sparkExecutorCores
number of cores used by single executor
workingPath
the working Path
hostedByMapPath
the hostedByMap Path
openaireOrganizationPath
the OpenAIRE Organizations path
outputPath
the Path of the sequence file action set
inputPathCrossref
the Crossref input path
inputPathMAG
the MAG input path
inputPathUnpayWall
the UnpayWall input path
workingPathOrcid
the ORCID working path
${jobTracker}
${nameNode}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
${wf:conf('resumeFrom') eq 'Skip'}
${wf:conf('resumeFrom') eq 'PreprocessMag'}
${wf:conf('resumeFrom') eq 'PreprocessUW'}
${wf:conf('resumeFrom') eq 'ProcessORCID'}
${wf:conf('resumeFrom') eq 'CreateDOIBoost'}
${wf:conf('resumeFrom') eq 'GenerateActionSet'}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn-cluster
cluster
ConvertCrossrefToOAF
eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${inputPathCrossref}/crossref_ds
--targetPath${workingPath}
--masteryarn-cluster
yarn-cluster
cluster
Convert Mag to OAF Dataset
eu.dnetlib.doiboost.mag.SparkProcessMAG
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorIntersectionMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${inputPathMAG}/dataset
--workingPath${workingPath}/MAG
--targetPath${workingPath}
--masteryarn-cluster
yarn-cluster
cluster
Convert UnpayWall to Dataset
eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${inputPathUnpayWall}/uw_extracted
--targetPath${workingPath}/uwPublication
--masteryarn-cluster
yarn-cluster
cluster
Convert ORCID to Dataset
eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--workingPath${workingPathOrcid}
--targetPath${workingPath}/orcidPublication
--masteryarn-cluster
yarn-cluster
cluster
Create DOIBoost Infospace
eu.dnetlib.doiboost.SparkGenerateDoiBoost
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorIntersectionMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--hostedByMapPath${hostedByMapPath}
--openaireOrganizationPath${openaireOrganizationPath}
--affiliationPath${inputPathMAG}/dataset/Affiliations
--paperAffiliationPath${inputPathMAG}/dataset/PaperAuthorAffiliations
--workingPath${workingPath}
--masteryarn-cluster
yarn-cluster
cluster
Generate DOIBoost ActionSet
eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet
dhp-doiboost-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--dbPublicationPath${workingPath}/doiBoostPublicationFiltered
--dbDatasetPath${workingPath}/crossrefDataset
--crossRefRelation${workingPath}/crossrefRelation
--dbaffiliationRelationPath${workingPath}/doiBoostPublicationAffiliation
--dbOrganizationPath${workingPath}/doiBoostOrganization
--targetPath${workingPath}/actionDataSet
--sFilePath${outputPath}
--masteryarn-cluster