From 87f802821e217c5d59a1979ffbac68f9e01cf5c7 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Sat, 11 Apr 2020 16:40:22 +0200
Subject: [PATCH] new workflow for country propagation: it is composed of a
 preparation step and a propagation step. The propagation part runs in
 parallel over the result types

---
 .../countrypropagation/oozie_app/workflow.xml | 147 +++++++++---------
 1 file changed, 70 insertions(+), 77 deletions(-)

diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
index 38aa93335..79789e095 100644
--- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
@@ -36,14 +36,6 @@
             <name>saveGraph</name>
             <description>writes new version of the graph after the propagation step</description>
         </property>
-        <property>
-            <name>outputPath</name>
-            <description>the path used to store temporary output files</description>
-        </property>
-        <property>
-            <name>preparedInfoPath</name>
-            <description>the path where prepared info have been stored</description>
-        </property>
     </parameters>
@@ -59,17 +51,16 @@
             <mode>cluster</mode>
             <name>PrepareDatasourceCountryAssociation</name>
             <class>eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation</class>
-            <jar>dhp-graph-propagation-${projectVersion}.jar</jar>
+            <jar>dhp-propagation-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
             <arg>--sourcePath</arg><arg>${sourcePath}</arg>
             <arg>--whitelist</arg><arg>${whitelist}</arg>
             <arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
@@ -80,8 +71,8 @@
-
-
+
+
@@ -92,18 +83,19 @@
             <mode>cluster</mode>
             <name>countryPropagationForPublications</name>
             <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob2</class>
-            <jar>dhp-graph-propagation-${projectVersion}.jar</jar>
+            <jar>dhp-propagation-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
             </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
             <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
             <arg>--writeUpdate</arg><arg>${writeUpdate}</arg>
             <arg>--saveGraph</arg><arg>${saveGraph}</arg>
             <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
             <arg>--outputPath</arg><arg>${workingDir}/country_propagation/publication</arg>
             <arg>--preparedInfoPath</arg><arg>${workingDir}/country_propagation/preparedInfo</arg>
-
+
@@ -121,26 +113,27 @@
             <mode>cluster</mode>
             <name>countryPropagationForDataset</name>
             <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob2</class>
-            <jar>dhp-graph-propagation-${projectVersion}.jar</jar>
+            <jar>dhp-propagation-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
             </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
             <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
             <arg>--writeUpdate</arg><arg>${writeUpdate}</arg>
             <arg>--saveGraph</arg><arg>${saveGraph}</arg>
             <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/country_propagation/publication</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/country_propagation/dataset</arg>
             <arg>--preparedInfoPath</arg><arg>${workingDir}/country_propagation/preparedInfo</arg>
-
+
@@ -150,26 +143,27 @@
             <mode>cluster</mode>
             <name>countryPropagationForORP</name>
             <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob2</class>
-            <jar>dhp-graph-propagation-${projectVersion}.jar</jar>
+            <jar>dhp-propagation-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
             </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
             <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
             <arg>--writeUpdate</arg><arg>${writeUpdate}</arg>
             <arg>--saveGraph</arg><arg>${saveGraph}</arg>
-            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Otherresearchproduct</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/country_propagation/publication</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/country_propagation/otherresearchproduct</arg>
             <arg>--preparedInfoPath</arg><arg>${workingDir}/country_propagation/preparedInfo</arg>
-
+
@@ -179,15 +173,47 @@
             <mode>cluster</mode>
             <name>countryPropagationForSoftware</name>
             <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob2</class>
-            <jar>dhp-graph-propagation-${projectVersion}.jar</jar>
+            <jar>dhp-propagation-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
+            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
+            <arg>--writeUpdate</arg><arg>${writeUpdate}</arg>
+            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/country_propagation/software</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/country_propagation/preparedInfo</arg>
+
+
+
+
+
+
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>countryPropagationForSoftware</name>
+            <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob2</class>
+            <jar>dhp-propagation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
             </spark-opts>
             <arg>-mt</arg> <arg>yarn-cluster</arg>
             <arg>--sourcePath</arg><arg>${sourcePath}</arg>
@@ -198,42 +224,9 @@
             <arg>--outputPath</arg><arg>${workingDir}/country_propagation/publication</arg>
             <arg>--preparedInfoPath</arg><arg>${workingDir}/country_propagation/preparedInfo</arg>
-
-
-
-
-
-
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>CountryPropagation</name>
-            <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
-            <jar>dhp-propagation-${projectVersion}.jar</jar>
-            <spark-opts>
-                --num-executors=${sparkExecutorNumber}
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-            </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--whitelist</arg><arg>${whitelist}</arg>
-            <arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
-            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
-            <arg>--writeUpdate</arg><arg>${writeUpdate}</arg>
-            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
-
-
+
\ No newline at end of file
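
Note on the parallel step: in Oozie, "runs in parallel over the result types" is expressed with a fork node that starts one propagation action per result type and a join node that waits for all of them before the workflow continues. The skeleton below is only an illustrative sketch of that pattern; the node names (fork_country_propagation, wait_propagation, End, Kill) and the action names referenced by the path elements are assumptions, not the identifiers defined in this workflow.xml.

    <!-- illustrative sketch: fork/join wiring for the four per-type propagation actions; all names here are assumed -->
    <fork name="fork_country_propagation">
        <path start="country_propagation_publication"/>
        <path start="country_propagation_dataset"/>
        <path start="country_propagation_otherresearchproduct"/>
        <path start="country_propagation_software"/>
    </fork>

    <!-- each of the four spark actions would end with <ok to="wait_propagation"/> and <error to="Kill"/> -->

    <join name="wait_propagation" to="End"/>
    <end name="End"/>

Under this arrangement the PrepareDatasourceCountryAssociation action runs first and transitions to the fork, after which the four SparkCountryPropagationJob2 actions execute concurrently, each reading the shared preparedInfo output and writing its own result-type directory under ${workingDir}/country_propagation.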