sourcePath
the source path
entity
the entity that should be processed
dedupConf
the dedup configuration
targetPath
the target path
sparkDriverMemory
memory for the driver process
sparkExecutorMemory
memory for each individual executor
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
${jobTracker}
${nameNode}
yarn-cluster
cluster
Create Similarity Relations
eu.dnetlib.dedup.SparkCreateSimRels
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--num-executors 100
--conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
-mtyarn-cluster
--sourcePath${sourcePath}
--targetPath${targetPath}
--entity${entity}
--dedupConf${dedupConf}
${jobTracker}
${nameNode}
yarn-cluster
cluster
Create Connected Components
eu.dnetlib.dedup.SparkCreateConnectedComponent
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--num-executors 100
--conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
-mtyarn-cluster
--sourcePath${sourcePath}
--targetPath${targetPath}
--entity${entity}
--dedupConf${dedupConf}
${jobTracker}
${nameNode}
yarn-cluster
cluster
Create Dedup Record
eu.dnetlib.dedup.SparkCreateDedupRecord
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--num-executors 100
--conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
-mtyarn-cluster
--sourcePath${sourcePath}
--dedupPath${targetPath}
--entity${entity}
--dedupConf${dedupConf}
${jobTracker}
${nameNode}
yarn-cluster
cluster
Propagate Dedup Relations
eu.dnetlib.dedup.SparkPropagateRelationsJob
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--num-executors 100
--conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
-mtyarn-cluster
--mergeRelPath${targetPath}/${entity}/mergeRel
--relationPath${sourcePath}/relation
--targetRelPath${targetPath}/${entity}/relation_updated