graphBasePath
the raw graph base path
isLookUpUrl
the address of the lookUp service
actionSetId
id of the actionSet
actionSetIdOpenorgs
id of the actionSet for OpenOrgs dedup
workingPath
path for the working directory
whiteListPath
path for the whitelist of similarity relations
dedupGraphPath
path for the output graph
cutConnectedComponent
max number of elements in a connected component
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
yarn
cluster
Create Similarity Relations
eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--graphBasePath ${graphBasePath}
--isLookUpUrl ${isLookUpUrl}
--actionSetId ${actionSetId}
--workingPath ${workingPath}
--numPartitions 15000
yarn
cluster
Add Whitelist Similarity Relations
eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--graphBasePath ${graphBasePath}
--isLookUpUrl ${isLookUpUrl}
--actionSetId ${actionSetId}
--workingPath ${workingPath}
--whiteListPath ${whiteListPath}
--numPartitions 15000
yarn
cluster
Create Merge Relations
eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--graphBasePath ${graphBasePath}
--workingPath ${workingPath}
--isLookUpUrl ${isLookUpUrl}
--actionSetId ${actionSetId}
--cutConnectedComponent ${cutConnectedComponent}
--hiveMetastoreUris ${hiveMetastoreUris}
--pivotHistoryDatabase ${pivotHistoryDatabase}
yarn
cluster
Create Dedup Record
eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--graphBasePath ${graphBasePath}
--workingPath ${workingPath}
--isLookUpUrl ${isLookUpUrl}
--actionSetId ${actionSetId}
yarn
cluster
Copy Openorgs Merge Relations
eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000
--graphBasePath ${graphBasePath}
--workingPath ${workingPath}
--isLookUpUrl ${isLookUpUrl}
--actionSetId ${actionSetIdOpenorgs}
--numPartitions 15000
yarn
cluster
Create Organizations Dedup Records
eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--graphBasePath ${graphBasePath}
--workingPath ${workingPath}
--isLookUpUrl ${isLookUpUrl}
--actionSetId ${actionSetIdOpenorgs}
yarn
cluster
Update Entity
eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
--graphBasePath ${graphBasePath}
--workingPath ${workingPath}
--dedupGraphPath ${dedupGraphPath}
yarn
cluster
Copy Non-Openorgs Relations
eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs
dhp-dedup-openaire-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
--graphBasePath ${graphBasePath}
--workingPath ${workingPath}
--dedupGraphPath ${dedupGraphPath}