From 909729a2fc2ce615f6a6f16182e99a9c656b6425 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 17 May 2023 10:16:22 +0200
Subject: [PATCH] [dedup] tweaking num partitions, minor changes

---
 .../dhp/oa/dedup/SparkCreateMergeRels.java    |  4 ++--
 .../dhp/oa/dedup/SparkWhitelistSimRels.java   |  4 ++--
 .../dhp/oa/dedup/scan/oozie_app/workflow.xml  | 20 +++++++++----------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
index 95e3dff28..2f551b244 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
@@ -59,7 +59,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					SparkCreateSimRels.class
+					SparkCreateMergeRels.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
 		parser.parseArgument(args);
@@ -99,7 +99,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
 		final String subEntity = dedupConf.getWf().getSubEntityValue();
 		final Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
 
-		log.info("Creating mergerels for: '{}'", subEntity);
+		log.info("Creating merge rels for: '{}'", subEntity);
 
 		final int maxIterations = dedupConf.getWf().getMaxIterations();
 		log.info("Max iterations {}", maxIterations);
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java
index 1cfac9a27..72c10e2a6 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java
@@ -31,7 +31,7 @@ import scala.Tuple2;
 
 public class SparkWhitelistSimRels extends AbstractSparkAction {
 
-	private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class);
+	private static final Logger log = LoggerFactory.getLogger(SparkWhitelistSimRels.class);
 
 	private static final String WHITELIST_SEPARATOR = "####";
 
@@ -43,7 +43,7 @@ public class SparkWhitelistSimRels extends AbstractSparkAction {
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					SparkCreateSimRels.class
+					SparkWhitelistSimRels.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json")));
 		parser.parseArgument(args);
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
index 02fdd8431..ba2270c8a 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
@@ -126,13 +126,13 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=15000
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
             <arg>--actionSetId</arg><arg>${actionSetId}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--numPartitions</arg><arg>8000</arg>
+            <arg>--numPartitions</arg><arg>15000</arg>
         </spark>
@@ -153,14 +153,14 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=15000
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
             <arg>--actionSetId</arg><arg>${actionSetId}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
             <arg>--whiteListPath</arg><arg>${whiteListPath}</arg>
-            <arg>--numPartitions</arg><arg>8000</arg>
+            <arg>--numPartitions</arg><arg>15000</arg>
         </spark>
@@ -181,7 +181,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=15000
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
@@ -208,7 +208,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=15000
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
@@ -235,13 +235,13 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=15000
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
             <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
             <arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
-            <arg>--numPartitions</arg><arg>8000</arg>
+            <arg>--numPartitions</arg><arg>15000</arg>
         </spark>
@@ -288,7 +288,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=10000
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
@@ -314,7 +314,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.sql.shuffle.partitions=10000
             </spark-opts>
             <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
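
The recurring change in the workflow hunks above raises spark.sql.shuffle.partitions (the number of partitions Spark SQL produces after a shuffle such as a join or aggregation) in step with the application-level --numPartitions argument, which the dedup actions read themselves. A minimal sketch of how such a pair of knobs is typically consumed by a Spark job follows; it is illustrative only, not part of this patch, and the class name, input path, and values are hypothetical.

    // Hypothetical sketch: pairing spark.sql.shuffle.partitions with an
    // application-level numPartitions value, as the workflow above does.
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class PartitionTuningSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession
                .builder()
                .appName("partition-tuning-sketch")
                // mirrors --conf spark.sql.shuffle.partitions=15000: partition
                // count produced by shuffles (joins, groupBy, ...)
                .config("spark.sql.shuffle.partitions", 15000)
                .getOrCreate();

            // mirrors <arg>--numPartitions</arg><arg>15000</arg>: an explicit
            // repartition of the input so reads run at comparable parallelism
            int numPartitions = 15000;
            Dataset<Row> entities = spark.read().json("/tmp/graph/publication");
            Dataset<Row> repartitioned = entities.repartition(numPartitions);

            System.out.println("partitions: " + repartitioned.rdd().getNumPartitions());
            spark.stop();
        }
    }

Keeping the two values aligned, as this patch does, avoids a mismatch where explicitly repartitioned data is immediately reshuffled into a much smaller or larger number of partitions.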