Make the number of partitions used in the SparkCreateSimRels phase configurable

2020-07-13 16:07:07 +02:00 · 2020-07-13 16:07:07 +02:00 · 8c67938ad0
parent c73168b18e
commit 8c67938ad0
4 changed files with 50 additions and 32 deletions
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
@ -2,6 +2,7 @@
 package eu.dnetlib.dhp.oa.dedup;
 import java.io.IOException;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@ -34,7 +35,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 	private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class);
-	public static final int NUM_PARTITIONS = 10000;
+	public static final int NUM_PARTITIONS = 1000;
 	public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) {
 		super(parser, spark);
@ -63,7 +64,12 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 		final String isLookUpUrl = parser.get("isLookUpUrl");
 		final String actionSetId = parser.get("actionSetId");
 		final String workingPath = parser.get("workingPath");
 		final int numPartitions = Optional
 				.ofNullable(parser.get("numPartitions"))
 				.map(Integer::valueOf)
 				.orElse(NUM_PARTITIONS);
 		log.info("numPartitions: '{}'", numPartitions);
 		log.info("graphBasePath: '{}'", graphBasePath);
 		log.info("isLookUpUrl:   '{}'", isLookUpUrl);
 		log.info("actionSetId:   '{}'", actionSetId);
@ -83,7 +89,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 			JavaPairRDD<String, MapDocument> mapDocuments = sc
 				.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
-				.repartition(NUM_PARTITIONS)
+				.repartition(numPartitions)
 				.mapToPair(
 					(PairFunction<String, String, MapDocument>) s -> {
 						MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
@ -93,13 +99,13 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 			// create blocks for deduplication
 			JavaPairRDD<String, Block> blocks = Deduper
 				.createSortedBlocks(mapDocuments, dedupConf)
-				.repartition(NUM_PARTITIONS);
+				.repartition(numPartitions);
 			// create relations by comparing only elements in the same group
 			Deduper
 				.computeRelations(sc, blocks, dedupConf)
 				.map(t -> createSimRel(t._1(), t._2(), entity))
-				.repartition(NUM_PARTITIONS)
+				.repartition(numPartitions)
 				.map(r -> OBJECT_MAPPER.writeValueAsString(r))
 				.saveAsTextFile(outputPath);
 		}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
@ -22,5 +22,11 @@
    "paramLongName": "workingPath",
    "paramDescription": "path of the working directory",
    "paramRequired": true
  },
  {
    "paramName": "np",
    "paramLongName": "numPartitions",
    "paramDescription": "number of partitions for the similarity relations intermediate phases",
    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
@ -20,6 +20,10 @@
            <name>dedupGraphPath</name>
            <description>path for the output graph</description>
        </property>
        <property>
            <name>cutConnectedComponent</name>
            <description>max number of elements in a connected component</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -106,10 +110,11 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--la</arg><arg>${isLookUpUrl}</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
-            <arg>--asi</arg><arg>${actionSetId}</arg>
+            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--numPartitions</arg><arg>8000</arg>
        </spark>
        <ok to="CreateMergeRel"/>
        <error to="Kill"/>
@ -132,10 +137,11 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--la</arg><arg>${isLookUpUrl}</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
-            <arg>--asi</arg><arg>${actionSetId}</arg>
+            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
            <arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
        </spark>
        <ok to="CreateDedupRecord"/>
        <error to="Kill"/>
@ -158,10 +164,10 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--la</arg><arg>${isLookUpUrl}</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
-            <arg>--asi</arg><arg>${actionSetId}</arg>
+            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
        </spark>
        <ok to="UpdateEntity"/>
        <error to="Kill"/>
@ -184,9 +190,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--o</arg><arg>${dedupGraphPath}</arg>
+            <arg>--dedupGraphPath</arg><arg>${dedupGraphPath}</arg>
        </spark>
        <ok to="copyRelations"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json
@ -1,17 +1,17 @@
 [
-{
+  {
-  "paramName": "i",
+    "paramName": "i",
-  "paramLongName": "graphBasePath",
+    "paramLongName": "graphBasePath",
-  "paramDescription": "the base path of raw graph",
+    "paramDescription": "the base path of raw graph",
-  "paramRequired": true
+    "paramRequired": true
-},
+  },
-{
+  {
-  "paramName": "w",
+    "paramName": "w",
-  "paramLongName": "workingPath",
+    "paramLongName": "workingPath",
-  "paramDescription": "the working directory path",
+    "paramDescription": "the working directory path",
-  "paramRequired": true
+    "paramRequired": true
-},
+  },
-{
+  {
    "paramName": "o",
    "paramLongName": "dedupGraphPath",
    "paramDescription": "the path of the dedup graph",