Increased the number of partitions produced by the join_all_entities phase (--numPartitions 24000 -> 35000) and raised spark.sql.shuffle.partitions in the adjancency_lists phase (7680 -> 15000)

2020-05-28 13:49:59 +02:00 · 2020-05-28 13:49:59 +02:00 · 5dea155a87
parent fdd54bad1c
commit 5dea155a87
1 changed file with 2 additions and 2 deletions
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@@ -362,7 +362,7 @@
            <arg>--inputGraphRootPath</arg><arg>${inputGraphRootPath}</arg>
            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
            <arg>--outputPath</arg><arg>${workingDir}/join_entities</arg>
-            <arg>--numPartitions</arg><arg>24000</arg>
+            <arg>--numPartitions</arg><arg>35000</arg>
        </spark>
        <ok to="adjancency_lists"/>
        <error to="Kill"/>
@@ -383,7 +383,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=7680
+                --conf spark.sql.shuffle.partitions=15000
                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>