From 50d6a2ad3c2f6fe19a52ff279749bd5633917fd6 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Fri, 15 May 2020 09:53:37 +0200
Subject: [PATCH] added output directory removal in the blacklist spark
 actions; included common global properties in blacklist's workflow.xml

---
 .../blacklist/PrepareMergedRelationJob.java   |  7 ++++
 .../SparkRemoveBlacklistedRelationJob.java    |  7 +++-
 .../dhp/blacklist/oozie_app/workflow.xml      | 37 +++++++++++--------
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
index 0ef59e8c2..2a46043e2 100644
--- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
@@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
 
 import java.util.Optional;
 
+import eu.dnetlib.dhp.common.HdfsSupport;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@@ -56,6 +57,7 @@ public class PrepareMergedRelationJob {
 			conf,
 			isSparkSessionManaged,
 			spark -> {
+				removeOutputDir(spark, outputPath);
 				selectMergesRelations(
 					spark,
 					inputPath,
@@ -84,4 +86,9 @@ public class PrepareMergedRelationJob {
 				(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
 				Encoders.bean(Relation.class));
 	}
+
+	private static void removeOutputDir(SparkSession spark, String path) {
+		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+	}
+
 }
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
index 86587bfc9..fe4310217 100644
--- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
@@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Objects;
 import java.util.Optional;
 
+import eu.dnetlib.dhp.common.HdfsSupport;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@@ -62,6 +63,7 @@ public class SparkRemoveBlacklistedRelationJob {
 			conf,
 			isSparkSessionManaged,
 			spark -> {
+				removeOutputDir(spark, outputPath);
 				removeBlacklistedRelations(
 					spark,
 					blacklistPath,
@@ -69,7 +71,6 @@ public class SparkRemoveBlacklistedRelationJob {
 					outputPath,
 					mergesPath);
 			});
-
 	}
 
 	private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
@@ -144,4 +145,8 @@
 			Encoders.bean(Relation.class));
 	}
 
+	private static void removeOutputDir(SparkSession spark, String path) {
+		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+	}
+
 }
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
index 1538318c1..dd7827da4 100644
--- a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
@@ -22,6 +22,25 @@
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
@@ -49,8 +68,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <arg>${nameNode}/${sourcePath}/publication</arg>
             <arg>${nameNode}/${outputPath}/publication</arg>
@@ -60,8 +77,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <arg>${nameNode}/${sourcePath}/dataset</arg>
             <arg>${nameNode}/${outputPath}/dataset</arg>
@@ -71,8 +86,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
             <arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
@@ -82,8 +95,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <arg>${nameNode}/${sourcePath}/software</arg>
             <arg>${nameNode}/${outputPath}/software</arg>
@@ -93,8 +104,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <arg>${nameNode}/${sourcePath}/organization</arg>
             <arg>${nameNode}/${outputPath}/organization</arg>
@@ -104,8 +113,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <arg>${nameNode}/${sourcePath}/project</arg>
             <arg>${nameNode}/${outputPath}/project</arg>
@@ -115,8 +122,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <arg>${nameNode}/${sourcePath}/datasource</arg>
             <arg>${nameNode}/${outputPath}/datasource</arg>
@@ -128,8 +133,6 @@
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
             <main-class>eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB</main-class>
             <arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
             <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
@@ -156,6 +159,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
             <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
             <arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
@@ -180,6 +184,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
             <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
             <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
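
Reviewer note (not part of the patch): both Spark jobs now call HdfsSupport.remove(outputPath, hadoopConfiguration) before writing, so re-running an action no longer fails because the output directory already exists. The body of HdfsSupport.remove is not shown in this diff; the sketch below is only a minimal approximation of what such a helper is assumed to do, namely a recursive delete through the Hadoop FileSystem API. The class name HdfsRemoveSketch and its error handling are illustrative, not taken from the repository.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Minimal sketch of an HDFS removal helper, assumed equivalent to the HdfsSupport.remove call used above. */
public class HdfsRemoveSketch {

	/** Recursively deletes the given path if it exists; a no-op when the path is absent. */
	public static void remove(String path, Configuration configuration) {
		try {
			FileSystem fileSystem = FileSystem.get(configuration);
			Path target = new Path(path);
			if (fileSystem.exists(target)) {
				fileSystem.delete(target, true); // true = recursive delete
			}
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}
}

Deleting the target directory up front keeps each action idempotent, which matters here because the Oozie workflow can be re-submitted or resumed after a partial failure.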