graphBasePath
the raw graph base path
isLookUpUrl
the address of the lookUp service
actionSetId
id of the actionSet
workingPath
path for the working directory
cutConnectedComponent
max number of elements in a connected component
sparkDriverMemory
memory for the driver process
sparkExecutorMemory
memory for each individual executor
sparkExecutorCores
number of cores used by a single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners class name
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners class name
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn
cluster
Create Similarity Relations
eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphBasePath${graphBasePath}
--isLookUpUrl${isLookUpUrl}
--actionSetId${actionSetIdOpenorgs}
--workingPath${workingPath}
--numPartitions1000
yarn
cluster
Copy OpenOrgs Sim Rels
eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsSimRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphBasePath${graphBasePath}
--isLookUpUrl${isLookUpUrl}
--workingPath${workingPath}
--actionSetId${actionSetIdOpenorgs}
--numPartitions1000
yarn
cluster
Create Merge Relations
eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphBasePath${graphBasePath}
--workingPath${workingPath}
--isLookUpUrl${isLookUpUrl}
--actionSetId${actionSetIdOpenorgs}
--cutConnectedComponent${cutConnectedComponent}
yarn
cluster
Prepare Organization Relations
eu.dnetlib.dhp.oa.dedup.SparkPrepareOrgRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphBasePath${graphBasePath}
--workingPath${workingPath}
--isLookUpUrl${isLookUpUrl}
--actionSetId${actionSetIdOpenorgs}
--dbUrl${dbUrl}
--dbTable${dbTable}
--dbUser${dbUser}
--dbPwd${dbPwd}
--numConnections20
yarn
cluster
Prepare New Organizations
eu.dnetlib.dhp.oa.dedup.SparkPrepareNewOrgs
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphBasePath${graphBasePath}
--workingPath${workingPath}
--isLookUpUrl${isLookUpUrl}
--actionSetId${actionSetIdOpenorgs}
--apiUrl${apiUrl}
--dbUrl${dbUrl}
--dbTable${dbTable}
--dbUser${dbUser}
--dbPwd${dbPwd}
--numConnections20