From ffc7488257b83f5c188263ad65c316e9b62bf90e Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 10 Jan 2025 13:20:05 +0100 Subject: [PATCH] Changes required to run cleaning workflow with spark 3.4 --- dhp-common/pom.xml | 6 + .../graph/clean/oozie_app/config-default.xml | 4 + .../dhp/oa/graph/clean/oozie_app/workflow.xml | 134 ++++++------------ 3 files changed, 56 insertions(+), 88 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index bfec019af6..9ce93ff275 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -128,6 +128,12 @@ eu.dnetlib cnr-rmi-api + + + log4j + log4j + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/config-default.xml index 2e0ed9aeea..ee84547f0a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/config-default.xml @@ -11,6 +11,10 @@ oozie.use.system.libpath true + + oozie.launcher.mapreduce.user.classpath.first + true + oozie.action.sharelib.for.spark spark2 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 01aaadae5b..0ecde24829 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -81,6 +81,22 @@ spark2EventLogDir spark 2.* event log dir location + + + sparkClusterOpts + --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf 
spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory + spark cluster-wide options + + + sparkResourceOpts + --executor-memory=8G --conf spark.executor.memoryOverhead=6G --executor-cores=6 --driver-memory=9G --driver-cores=4 + spark resource options + + + sparkApplicationOpts + --conf spark.sql.shuffle.partitions=1000 + spark application options + @@ -102,13 +118,9 @@ eu.dnetlib.dhp.oa.graph.clean.GetDatasourceFromCountry dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=10000 --inputPath${graphInputPath} @@ -154,15 +166,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=15000 --inputPath${graphInputPath}/publication @@ -190,15 +196,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - 
--driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=8000 --inputPath${graphInputPath}/dataset @@ -226,15 +226,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=5000 --inputPath${graphInputPath}/otherresearchproduct @@ -262,15 +256,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + 
${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=2000 --inputPath${graphInputPath}/software @@ -298,15 +286,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=1000 --inputPath${graphInputPath}/datasource @@ -334,15 +316,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=1000 --inputPath${graphInputPath}/organization @@ -370,15 +346,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=2000 --inputPath${graphInputPath}/project @@ -406,15 +376,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=2000 --inputPath${graphInputPath}/person @@ -442,15 +406,9 @@ eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.autoBroadcastJoinThreshold=-1 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} --conf spark.sql.shuffle.partitions=20000 --inputPath${graphInputPath}/relation