graphOutputPath
the target path to store raw graph
reuseDBClaims
false
should import content from the aggregator or reuse a previous version
reuseODFClaims
false
should import content from the aggregator or reuse a previous version
reuseOAFClaims
false
should import content from the aggregator or reuse a previous version
reuseDB
false
should import content from the aggregator or reuse a previous version
reuseDBOpenorgs
false
should import content from the aggregator or reuse a previous version
reuseODF
false
should import content from the aggregator or reuse a previous version
reuseOAF
false
should import content from the aggregator or reuse a previous version
reuseODF_hdfs
false
should import content from the aggregator or reuse a previous version
reuseOAF_hdfs
false
should import content from the aggregator or reuse a previous version
contentPath
path location to store (or reuse) content from the aggregator
postgresURL
the postgres URL to access the database
postgresUser
the postgres user
postgresPassword
the postgres password
postgresOpenOrgsURL
the postgres URL to access the OpenOrgs database
postgresOpenOrgsUser
the user of OpenOrgs database
postgresOpenOrgsPassword
the password of OpenOrgs database
dbSchema
beta
the database schema according to the D-Net infrastructure (beta or production)
mongoURL
mongoDB url, example: mongodb://[username:password@]host[:port]
mongoDb
mongo database
isLookupUrl
the address of the lookUp service
nsPrefixBlacklist
a blacklist of nsprefixes (comma separated)
shouldPatchRelations
false
activates the relation patching phase, driven by the content in ${idMappingPath}
idMappingPath
path pointing to the relations identifiers mapping dataset
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
${wf:conf('reuseDBClaims') eq false}
${wf:conf('reuseDBClaims') eq true}
eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication
--hdfsPath${contentPath}/db_claims
--postgresUrl${postgresURL}
--postgresUser${postgresUser}
--postgresPassword${postgresPassword}
--isLookupUrl${isLookupUrl}
--actionclaims
--dbschema${dbSchema}
--nsPrefixBlacklist${nsPrefixBlacklist}
${wf:conf('reuseODFClaims') eq false}
${wf:conf('reuseODFClaims') eq true}
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
-p${contentPath}/odf_claims
-mongourl${mongoURL}
-mongodb${mongoDb}
-fODF
-lstore
-iclaim
${wf:conf('reuseOAFClaims') eq false}
${wf:conf('reuseOAFClaims') eq true}
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
-p${contentPath}/oaf_claims
-mongourl${mongoURL}
-mongodb${mongoDb}
-fOAF
-lstore
-iclaim
${wf:conf('reuseDB') eq false}
${wf:conf('reuseDB') eq true}
eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication
--hdfsPath${contentPath}/db_openaire
--postgresUrl${postgresURL}
--postgresUser${postgresUser}
--postgresPassword${postgresPassword}
--isLookupUrl${isLookupUrl}
--actionopenaire
--dbschema${dbSchema}
--nsPrefixBlacklist${nsPrefixBlacklist}
${wf:conf('reuseODF') eq false}
${wf:conf('reuseODF') eq true}
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
--hdfsPath${contentPath}/odf_records
--mongoBaseUrl${mongoURL}
--mongoDb${mongoDb}
--mdFormatODF
--mdLayoutstore
--mdInterpretationcleaned
${wf:conf('reuseOAF') eq false}
${wf:conf('reuseOAF') eq true}
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
--hdfsPath${contentPath}/oaf_records
--mongoBaseUrl${mongoURL}
--mongoDb${mongoDb}
--mdFormatOAF
--mdLayoutstore
--mdInterpretationcleaned
eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication
--hdfsPath${contentPath}/oaf_records_invisible
--mongoBaseUrl${mongoURL}
--mongoDb${mongoDb}
--mdFormatOAF
--mdLayoutstore
--mdInterpretationintersection
${wf:conf('reuseODF_hdfs') eq false}
${wf:conf('reuseODF_hdfs') eq true}
yarn
cluster
ImportODF_hdfs
eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--hdfsPath${contentPath}/odf_records_hdfs
--mdstoreManagerUrl${mdstoreManagerUrl}
--mdFormatODF
--mdLayoutstore
--mdInterpretationcleaned
${wf:conf('reuseOAF_hdfs') eq false}
${wf:conf('reuseOAF_hdfs') eq true}
yarn
cluster
ImportOAF_hdfs
eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--hdfsPath${contentPath}/oaf_records_hdfs
--mdstoreManagerUrl${mdstoreManagerUrl}
--mdFormatOAF
--mdLayoutstore
--mdInterpretationcleaned
${wf:conf('reuseDBOpenorgs') eq false}
${wf:conf('reuseDBOpenorgs') eq true}
eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication
--hdfsPath${contentPath}/db_openorgs
--postgresUrl${postgresOpenOrgsURL}
--postgresUser${postgresOpenOrgsUser}
--postgresPassword${postgresOpenOrgsPassword}
--isLookupUrl${isLookupUrl}
--actionopenorgs
--dbschema${dbSchema}
--nsPrefixBlacklist${nsPrefixBlacklist}
yarn
cluster
GenerateEntities_claim
eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePaths${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims,${contentPath}/oaf_records_invisible
--targetPath${workingDir}/entities_claim
--isLookupUrl${isLookupUrl}
--shouldHashId${shouldHashId}
--modeclaim
yarn
cluster
GenerateGraph_claims
eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePath${workingDir}/entities_claim
--graphRawPath${workingDir}/graph_claims
yarn
cluster
GenerateEntities
eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs
--targetPath${workingDir}/entities
--isLookupUrl${isLookupUrl}
--shouldHashId${shouldHashId}
yarn
cluster
GenerateGraph
eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--sourcePath${workingDir}/entities
--graphRawPath${workingDir}/graph_raw
yarn
cluster
MergeClaims_publication
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication
yarn
cluster
MergeClaims_dataset
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset
yarn
cluster
MergeClaims_relation
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation
yarn
cluster
MergeClaims_software
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=1920
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software
yarn
cluster
MergeClaims_otherresearchproduct
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=1920
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
yarn
cluster
MergeClaims_datasource
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=200
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource
yarn
cluster
MergeClaims_organization
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=200
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization
yarn
cluster
MergeClaims_project
eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=200
--rawGraphPath${workingDir}/graph_raw
--claimsGraphPath${workingDir}/graph_claims
--outputRawGaphPath${graphOutputPath}
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project
${(shouldPatchRelations eq "true") and
(fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")}
yarn
cluster
PatchRelations
eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication
dhp-graph-mapper-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--graphBasePath${graphOutputPath}
--workingDir${workingDir}/patch_relations
--idMappingPath${idMappingPath}