graphOutputPath
the target path to store the raw graph
reuseContent
false
whether to reuse content from a previous import instead of importing it from the aggregator
postgresURL
the postgres URL to access the database
postgresUser
the postgres user
postgresPassword
the postgres password
mongoURL
the MongoDB URL, e.g. mongodb://[username:password@]host[:port]
mongoDb
the MongoDB database name
sparkDriverMemory
memory for the driver process
sparkExecutorMemory
memory for each executor
sparkExecutorCores
number of cores used by a single executor
oozieActionShareLibForSpark2
the Oozie action sharelib for Spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
Spark 2.* extra listeners class name
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
Spark 2.* SQL query execution listeners class name
spark2YarnHistoryServerAddress
Spark 2.* YARN history server address
spark2EventLogDir
Spark 2.* event log directory location
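Read in triples of name, optional default value, and description, these entries correspond to <property> elements in the workflow's <parameters> section. A minimal sketch of one entry, assuming the standard Oozie layout rather than this workflow's exact source:

    <parameters>
        <property>
            <name>reuseContent</name>
            <value>false</value>
            <description>whether to reuse content from a previous import instead of importing it from the aggregator</description>
        </property>
        <!-- the remaining properties follow the same name/value/description pattern -->
    </parameters>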
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
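The job tracker, name node, and queue properties above are typically grouped in a <global> element so that every action inherits them; a sketch assuming the standard Oozie conventions:

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>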
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
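This is the message of the workflow's kill node, reached whenever an action fails; a sketch of the node itself (the node name "Kill" is an assumption):

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>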
${wf:conf('reuseContent') eq false}
${wf:conf('reuseContent') eq true}
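These two EL predicates drive the reuse-or-import decision: when reuseContent is false the workflow runs the import actions below, otherwise it skips straight to entity generation. A sketch of the decision node, with hypothetical transition names:

    <decision name="ReuseContent">
        <switch>
            <case to="ImportDB_claims">${wf:conf('reuseContent') eq false}</case>
            <case to="GenerateEntities_claim">${wf:conf('reuseContent') eq true}</case>
            <default to="ImportDB_claims"/>
        </switch>
    </decision>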
eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication
-p ${workingDir}/db_claims
-pgurl ${postgresURL}
-pguser ${postgresUser}
-pgpasswd ${postgresPassword}
-a claims
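MigrateDbEntitiesApplication reads claim entities from PostgreSQL (-a claims) and dumps them to ${workingDir}/db_claims. In the workflow definition each flag and its value are separate <arg> elements; a sketch with a hypothetical action name and transitions:

    <action name="ImportDB_claims">
        <java>
            <main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication</main-class>
            <arg>-p</arg><arg>${workingDir}/db_claims</arg>
            <arg>-pgurl</arg><arg>${postgresURL}</arg>
            <arg>-pguser</arg><arg>${postgresUser}</arg>
            <arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
            <arg>-a</arg><arg>claims</arg>
        </java>
        <ok to="ImportODF_claims"/>
        <error to="Kill"/>
    </action>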
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
-p ${workingDir}/odf_claims
-mongourl ${mongoURL}
-mongodb ${mongoDb}
-f ODF
-l store
-i claim
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
-p ${workingDir}/oaf_claims
-mongourl ${mongoURL}
-mongodb ${mongoDb}
-f OAF
-l store
-i claim
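The two MigrateMongoMdstoresApplication runs above differ only in the metadata format (-f ODF vs. -f OAF) and the output path; both read the claim interpretation (-i claim) from the mdstores layout (-l store). A sketch of one of them, again with a hypothetical action name:

    <action name="ImportODF_claims">
        <java>
            <main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
            <arg>-p</arg><arg>${workingDir}/odf_claims</arg>
            <arg>-mongourl</arg><arg>${mongoURL}</arg>
            <arg>-mongodb</arg><arg>${mongoDb}</arg>
            <arg>-f</arg><arg>ODF</arg>
            <arg>-l</arg><arg>store</arg>
            <arg>-i</arg><arg>claim</arg>
        </java>
        <ok to="ImportOAF_claims"/>
        <error to="Kill"/>
    </action>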
eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication
-p ${workingDir}/db_records
-pgurl ${postgresURL}
-pguser ${postgresUser}
-pgpasswd ${postgresPassword}
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
-p ${workingDir}/odf_records
-mongourl ${mongoURL}
-mongodb ${mongoDb}
-f ODF
-l store
-i cleaned
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
-p ${workingDir}/oaf_records
-mongourl ${mongoURL}
-mongodb ${mongoDb}
-f OAF
-l store
-i cleaned
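The three record imports above mirror the claim imports, switching the interpretation flag to -i cleaned and writing to the *_records paths. Because the claim chain and the record chain are independent of each other, workflows like this one often run them in parallel with a fork/join pair; whether this particular workflow does so is not visible in the fragment, so the node names below are purely hypothetical:

    <fork name="fork_import">
        <path start="ImportDB_claims"/>
        <path start="ImportDB"/>
    </fork>
    <!-- ... the two import chains ... -->
    <join name="join_import" to="GenerateEntities_claim"/>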
yarn
cluster
GenerateEntities_claim
eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-s ${workingDir}/db_claims,${workingDir}/oaf_claims,${workingDir}/odf_claims
-t ${workingDir}/entities_claim
-pgurl ${postgresURL}
-pguser ${postgresUser}
-pgpasswd ${postgresPassword}
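GenerateEntitiesApplication is submitted as a Spark action: it reads the three claim dumps (-s), resolves them against PostgreSQL, and writes a single entity file (-t). A sketch of the action, assuming the spark-action:0.2 schema and a hypothetical ok-transition:

    <action name="GenerateEntities_claim">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>GenerateEntities_claim</name>
            <class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --executor-cores ${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>-s</arg><arg>${workingDir}/db_claims,${workingDir}/oaf_claims,${workingDir}/odf_claims</arg>
            <arg>-t</arg><arg>${workingDir}/entities_claim</arg>
            <arg>-pgurl</arg><arg>${postgresURL}</arg>
            <arg>-pguser</arg><arg>${postgresUser}</arg>
            <arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
        </spark>
        <ok to="GenerateGraph_claims"/>
        <error to="Kill"/>
    </action>

The later Spark actions (GenerateGraph_claims, GenerateEntities, GenerateGraph, and the MergeClaims_* steps) follow the same structure, varying only in class, arguments, and shuffle-partition settings.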
yarn
cluster
GenerateGraph_claims
eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-s ${workingDir}/entities_claim
-g ${workingDir}/graph_claims
yarn
cluster
GenerateEntities
eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-s ${workingDir}/db_records,${workingDir}/oaf_records,${workingDir}/odf_records
-t ${workingDir}/entities
-pgurl ${postgresURL}
-pguser ${postgresUser}
-pgpasswd ${postgresPassword}
yarn
cluster
GenerateGraph
eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
-s ${workingDir}/entities
-g ${workingDir}/graph_raw
yarn
cluster
MergeClaims_publication
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.Publication
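Each MergeClaims_* action merges the raw graph with the claims graph for one entity type, selected via --graphTableClassName, and tunes spark.sql.shuffle.partitions to the expected table size (7680 for publications and datasets, 3840 for relations, 1920 for software and other research products, 200 for the small entity tables). A sketch of the distinguishing part of the publication action, assuming the same spark-action layout as above; the --outputRawGaphPath spelling is kept as-is, since CLI flags must match the application's declared parameter names:

    <spark-opts>
        <!-- ... shared memory/listener options as above ... -->
        --conf spark.sql.shuffle.partitions=7680
    </spark-opts>
    <arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
    <arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
    <arg>--outputRawGaphPath</arg><arg>${graphOutputPath}</arg>
    <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

The seven remaining MergeClaims_* actions below differ only in the target class name and the partition count.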
yarn
cluster
MergeClaims_dataset
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.Dataset
yarn
cluster
MergeClaims_relation
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.Relation
yarn
cluster
MergeClaims_software
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=1920
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.Software
yarn
cluster
MergeClaims_otherresearchproduct
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=1920
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.OtherResearchProduct
yarn
cluster
MergeClaims_datasource
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=200
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.Datasource
yarn
cluster
MergeClaims_organization
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=200
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.Organization
yarn
cluster
MergeClaims_project
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=200
--rawGraphPath ${workingDir}/graph_raw
--claimsGraphPath ${workingDir}/graph_claims
--outputRawGaphPath ${graphOutputPath}
--graphTableClassName eu.dnetlib.dhp.schema.oaf.Project