entitiesPath: the input entity path; workingPath: path for the working directory; numPartitions: number of partitions for the Spark files; ldaInferencePath: the output path to store the inference result; vocabularyPath: location of the vocabulary; ldaModelPath: location of the LDA model; authorsPath: location of the authors; sparkDriverMemory: memory for the driver process; sparkExecutorMemory: memory for an individual executor; sparkExecutorCores: number of cores used by a single executor; oozieActionShareLibForSpark2: Oozie action sharelib for Spark 2.*; spark2ExtraListeners (com.cloudera.spark.lineage.NavigatorAppListener): Spark 2.* extra listeners class name; spark2SqlQueryExecutionListeners (com.cloudera.spark.lineage.NavigatorQueryListener): Spark 2.* SQL query execution listeners class name; spark2YarnHistoryServerAddress: Spark 2.* YARN history server address; spark2EventLogDir: Spark 2.* event log dir location

${jobTracker}

${nameNode}

mapreduce.job.queuename ${queueName} oozie.launcher.mapred.job.queue.name ${oozieLauncherQueueName} oozie.action.sharelib.for.spark ${oozieActionShareLibForSpark2} Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] yarn cluster Tokenize Data eu.dnetlib.jobs.SparkTokenizer dnet-and-test-${projectVersion}.jar

--num-executors=32 --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 --conf spark.dynamicAllocation.enabled=false

--entitiesPath ${entitiesPath} --inputFieldJPath ${inputFieldJPath} --workingPath ${workingPath} --numPartitions ${numPartitions} yarn cluster Create Count Vectors eu.dnetlib.jobs.SparkCountVectorizer dnet-and-test-${projectVersion}.jar

--executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840

--vocabularyPath ${vocabularyPath} --workingPath ${workingPath} --numPartitions ${numPartitions} yarn cluster LDA Inference eu.dnetlib.jobs.featureextraction.lda.SparkLDAInference dnet-and-test-${projectVersion}.jar

--workingPath ${workingPath} --outputPath ${ldaInferencePath} --ldaModelPath ${ldaModelPath} --numPartitions ${numPartitions}