entitiesPath
the input entity path
workingPath
path for the working directory
numPartitions
number of partitions for the spark files
inputFieldJPath
json path of the input field in the entities
vocabularyPath
location of the vocabulary
vocabularyType
type of the vocabulary: file or tokens
trainRatio
percentage of the data to be used as training set
numTopics
number of topics against which to test the LDA model
maxIterations
maximum number of iterations of the LDA algorithm
outputModelPath
location of the best LDA model
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn
cluster
Tokenize Data
eu.dnetlib.jobs.SparkTokenizer
dnet-and-test-${projectVersion}.jar
--num-executors=32
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.dynamicAllocation.enabled=false
--entitiesPath ${entitiesPath}
--inputFieldJPath ${inputFieldJPath}
--workingPath ${workingPath}
--numPartitions ${numPartitions}
yarn
cluster
Create Vocabulary
eu.dnetlib.jobs.SparkCreateVocabulary
dnet-and-test-${projectVersion}.jar
--num-executors=32
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.dynamicAllocation.enabled=true
--vocabularyPath ${vocabularyPath}
--vocabularyType ${vocabularyType}
--workingPath ${workingPath}
--numPartitions ${numPartitions}
yarn
cluster
Create Count Vectors
eu.dnetlib.jobs.SparkCountVectorizer
dnet-and-test-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--vocabularyPath ${vocabularyPath}
--workingPath ${workingPath}
--numPartitions ${numPartitions}
yarn
cluster
LDA Tuning
eu.dnetlib.jobs.featureextraction.lda.SparkLDATuning
dnet-and-test-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--trainRatio ${trainRatio}
--numTopics ${numTopics}
--maxIterations ${maxIterations}
--outputModelPath ${outputModelPath}
--workingPath ${workingPath}
--numPartitions ${numPartitions}