entitiesPath
the input entity path
workingPath
path for the working directory
numPartitions
number of partitions for the spark files
inputFieldJPath
json path of the input field in the entities
vocabularyPath
location of the vocabulary
vocabularyType
type of the vocabulary: file or tokens
trainRatio
percentage of the data to be used as training set
numTopics
number of topics against which to test the LDA model
maxIterations
maximum number of iterations of the LDA algorithm
outputModelPath
location of the best LDA model
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn
cluster
Tokenize Data
eu.dnetlib.jobs.SparkTokenizer
dnet-and-test-${projectVersion}.jar
--num-executors=32
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.dynamicAllocation.enabled=false
--entitiesPath ${entitiesPath}
--inputFieldJPath ${inputFieldJPath}
--workingPath ${workingPath}
--numPartitions ${numPartitions}
yarn
cluster
Create Vocabulary
eu.dnetlib.jobs.SparkCreateVocabulary
dnet-and-test-${projectVersion}.jar
--num-executors=32
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.dynamicAllocation.enabled=true
--vocabularyPath ${vocabularyPath}
--vocabularyType ${vocabularyType}
--workingPath ${workingPath}
--numPartitions ${numPartitions}
yarn
cluster
Create Count Vectors
eu.dnetlib.jobs.SparkCountVectorizer
dnet-and-test-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--vocabularyPath ${vocabularyPath}
--workingPath ${workingPath}
--numPartitions ${numPartitions}
yarn
cluster
LDA Tuning
eu.dnetlib.jobs.featureextraction.lda.SparkLDATuning
dnet-and-test-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--trainRatio ${trainRatio}
--numTopics ${numTopics}
--maxIterations ${maxIterations}
--outputModelPath ${outputModelPath}
--workingPath ${workingPath}
--numPartitions ${numPartitions}