sourcePath
the source path
outputPath
the output path
country
the country for which to produce the dump
hiveDbName
the target hive database name
hiveJdbcUrl
hive server jdbc url
hiveMetastoreUris
hive server metastore URIs
sparkDriverMemory
memory for driver process
sparkExecutorMemory
memory for individual executor
sparkExecutorCores
number of cores used by single executor
oozieActionShareLibForSpark2
oozie action sharelib for spark 2.*
spark2ExtraListeners
com.cloudera.spark.lineage.NavigatorAppListener
spark 2.* extra listeners classname
spark2SqlQueryExecutionListeners
com.cloudera.spark.lineage.NavigatorQueryListener
spark 2.* sql query execution listeners classname
spark2YarnHistoryServerAddress
spark 2.* yarn history server address
spark2EventLogDir
spark 2.* event log dir location
${jobTracker}
${nameNode}
mapreduce.job.queuename
${queueName}
oozie.launcher.mapred.job.queue.name
${oozieLauncherQueueName}
oozie.action.sharelib.for.spark
${oozieActionShareLibForSpark2}
eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap
--outputPath${workingDir}/communityMap
--nameNode${nameNode}
--isLookUpUrl${isLookUpUrl}
yarn
cluster
Find results related to the selected country
eu.dnetlib.dhp.oa.graph.dump.country.SparkFindResultsRelatedToCountry
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}
--outputPath${workingDir}/resultsInCountry
--country${country}
yarn
cluster
Dump table publication
eu.dnetlib.dhp.oa.graph.dump.country.SparkFindResultWithCountry
dump-${projectVersion}.jar
--executor-memory=7G
--executor-cores=2
--driver-memory=7G
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/publication
--resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
--outputPath${outputPath}
--resultWithCountry${workingDir}/resultsInCountry
--resultTypepublication
yarn
cluster
Dump table dataset
eu.dnetlib.dhp.oa.graph.dump.country.SparkFindResultWithCountry
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/dataset
--resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
--outputPath${outputPath}
--resultTypedataset
--resultWithCountry${workingDir}/resultsInCountry
yarn
cluster
Dump table ORP
eu.dnetlib.dhp.oa.graph.dump.country.SparkFindResultWithCountry
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/otherresearchproduct
--resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
--outputPath${outputPath}
--resultTypeotherresearchproduct
--resultWithCountry${workingDir}/resultsInCountry
yarn
cluster
Dump table software
eu.dnetlib.dhp.oa.graph.dump.country.SparkFindResultWithCountry
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}/software
--resultTableNameeu.dnetlib.dhp.schema.oaf.Software
--outputPath${outputPath}
--resultTypesoftware
--resultWithCountry${workingDir}/resultsInCountry
yarn
cluster
Select the subset of valid relations
eu.dnetlib.dhp.oa.graph.dump.subset.SparkSelectSubset
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.shuffle.partitions=3840
--sourcePath${sourcePath}
--outputPath${outputPath}
--removeSet${removeSet}
yarn
cluster
Dump table publication for community/funder related products
eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${outputPath}/original/publication
--resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
--outputPath${workingDir}/dump/publication
--communityMapPath${workingDir}/communityMap
--dumpTypecountry
yarn
cluster
Dump table dataset for community/funder related products
eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${outputPath}/original/dataset
--resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
--outputPath${workingDir}/dump/dataset
--communityMapPath${workingDir}/communityMap
--dumpTypecountry
yarn
cluster
Dump table ORP for community/funder related products
eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${outputPath}/original/otherresearchproduct
--resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
--outputPath${workingDir}/dump/otherresearchproduct
--communityMapPath${workingDir}/communityMap
--dumpTypecountry
yarn
cluster
Dump table software for community/funder related products
eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${outputPath}/original/software
--resultTableNameeu.dnetlib.dhp.schema.oaf.Software
--outputPath${workingDir}/dump/software
--communityMapPath${workingDir}/communityMap
--dumpTypecountry
yarn
cluster
Prepare association result subset of project info
eu.dnetlib.dhp.oa.graph.dump.community.SparkPrepareResultProject
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${sourcePath}
--outputPath${workingDir}/preparedInfo
yarn
cluster
Extend dumped publications with information about project
eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${workingDir}/dump/publication
--outputPath${outputPath}/dump/publication
--preparedInfoPath${workingDir}/preparedInfo
yarn
cluster
Extend dumped dataset with information about project
eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${workingDir}/dump/dataset
--outputPath${outputPath}/dump/dataset
--preparedInfoPath${workingDir}/preparedInfo
yarn
cluster
Extend dumped ORP with information about project
eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${workingDir}/dump/otherresearchproduct
--outputPath${outputPath}/dump/otherresearchproduct
--preparedInfoPath${workingDir}/preparedInfo
yarn
cluster
Extend dumped software with information about project
eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo
dump-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--sourcePath${workingDir}/dump/software
--outputPath${outputPath}/dump/software
--preparedInfoPath${workingDir}/preparedInfo
eu.dnetlib.dhp.oa.graph.dump.MakeTar
--hdfsPath${outputPath}/tar
--nameNode${nameNode}
--sourcePath${outputPath}/dump
Sub-workflow dump complete failed with error message ${wf:errorMessage()}