Workflow parameters:

- graphBasePath: the raw graph base path
- isLookUpUrl: the address of the lookUp service
- actionSetId: id of the actionSet
- actionSetIdOpenorgs: id of the actionSet for the OpenOrgs dedup
- workingPath: path for the working directory
- whiteListPath: path for the whitelist of similarity relations
- dedupGraphPath: path for the output graph
- cutConnectedComponent: max number of elements in a connected component
- sparkDriverMemory: memory for the driver process
- sparkExecutorMemory: memory for each individual executor
- sparkExecutorCores: number of cores used by a single executor
- oozieActionShareLibForSpark2: oozie action sharelib for spark 2.*
- spark2ExtraListeners: spark 2.* extra listeners classname (default: com.cloudera.spark.lineage.NavigatorAppListener)
- spark2SqlQueryExecutionListeners: spark 2.* sql query execution listeners classname (default: com.cloudera.spark.lineage.NavigatorQueryListener)
- spark2YarnHistoryServerAddress: spark 2.* yarn history server address
- spark2EventLogDir: spark 2.* event log dir location

Global configuration, applied to every action:

- job tracker: ${jobTracker}
- name node: ${nameNode}
- mapreduce.job.queuename: ${queueName}
- oozie.launcher.mapred.job.queue.name: ${oozieLauncherQueueName}
- oozie.action.sharelib.for.spark: ${oozieActionShareLibForSpark2}

If a node fails, the workflow is killed with the message: Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]

Every step is a Spark action submitted to YARN in cluster mode, packaged in dhp-dedup-openaire-${projectVersion}.jar and launched with the same Spark options:

    --executor-memory=${sparkExecutorMemory}
    --executor-cores=${sparkExecutorCores}
    --driver-memory=${sparkDriverMemory}
    --conf spark.extraListeners=${spark2ExtraListeners}
    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
    --conf spark.sql.shuffle.partitions=3840

The workflow consists of the following steps, listed in the order they appear in the definition.

Step 1: Create Similarity Relations
- class: eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels
- arguments: --graphBasePath ${graphBasePath} --isLookUpUrl ${isLookUpUrl} --actionSetId ${actionSetId} --workingPath ${workingPath} --numPartitions 8000
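The listing above comes from an Oozie workflow definition; as an illustration, a minimal sketch of how the Create Similarity Relations step could be expressed as an Oozie Spark action is shown below. The action name and the ok/error transition targets are assumptions made for the example, not values taken from the original file; the remaining steps follow the same shape.

```xml
<!-- Sketch only: the action name and the ok/error targets are illustrative assumptions -->
<action name="CreateSimRels">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Create Similarity Relations</name>
        <class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
        <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.shuffle.partitions=3840
        </spark-opts>
        <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
        <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
        <arg>--actionSetId</arg><arg>${actionSetId}</arg>
        <arg>--workingPath</arg><arg>${workingPath}</arg>
        <arg>--numPartitions</arg><arg>8000</arg>
    </spark>
    <ok to="WhitelistSimRels"/>
    <error to="Kill"/>
</action>
```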
Step 2: Add Whitelist Similarity Relations
- class: eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels
- arguments: --graphBasePath ${graphBasePath} --isLookUpUrl ${isLookUpUrl} --actionSetId ${actionSetId} --workingPath ${workingPath} --whiteListPath ${whiteListPath} --numPartitions 8000

Step 3: Create Merge Relations
- class: eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels
- arguments: --graphBasePath ${graphBasePath} --workingPath ${workingPath} --isLookUpUrl ${isLookUpUrl} --actionSetId ${actionSetId} --cutConnectedComponent ${cutConnectedComponent}

Step 4: Create Dedup Record
- class: eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord
- arguments: --graphBasePath ${graphBasePath} --workingPath ${workingPath} --isLookUpUrl ${isLookUpUrl} --actionSetId ${actionSetId}

Step 5: Copy Openorgs Merge Relations
- class: eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels
- arguments: --graphBasePath ${graphBasePath} --workingPath ${workingPath} --isLookUpUrl ${isLookUpUrl} --actionSetId ${actionSetIdOpenorgs} --numPartitions 8000

Step 6: Create Organizations Dedup Records
- class: eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord
- arguments: --graphBasePath ${graphBasePath} --workingPath ${workingPath} --isLookUpUrl ${isLookUpUrl} --actionSetId ${actionSetIdOpenorgs}

Step 7: Update Entity
- class: eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity
- arguments: --graphBasePath ${graphBasePath} --workingPath ${workingPath} --dedupGraphPath ${dedupGraphPath}

Step 8: Copy Non-Openorgs Relations
- class: eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs
- arguments: --graphBasePath ${graphBasePath} --workingPath ${workingPath} --dedupGraphPath ${dedupGraphPath}
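For completeness, the sketch below shows how the parameter declarations, the global configuration, and the kill node listed above could be wired together in the enclosing workflow-app. The workflow-app name, the namespace version, and the node names used in the transitions are illustrative assumptions; the eight Spark actions described above would chain between start and end, each ok transition pointing to the next step and each error transition pointing to the kill node.

```xml
<!-- Sketch only: the workflow-app name and the node names are illustrative assumptions -->
<workflow-app name="Deduplication" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>graphBasePath</name>
            <description>the raw graph base path</description>
        </property>
        <!-- the remaining parameters listed above are declared the same way -->
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="CreateSimRels"/>
    <!-- the eight Spark actions described above go here, each <ok> pointing to the next step -->
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name="End"/>
</workflow-app>
```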