From 3549b6976d23bd19426eb27ba8ccfa003487ee70 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 8 Jan 2025 15:44:58 +0100 Subject: [PATCH] removed dataset caching as it seems to impact too much on the memory footprint --- .../eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml | 1 + .../dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 3c674fccd..064c794ca 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -639,6 +639,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=8000 --masteryarn --hdfsPath${workingDir}/graph_raw diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala index 1177b34f4..3f0a4b75a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -62,7 +62,6 @@ object CopyHdfsOafSparkApplication { val oaf = spark.read .textFile(validPaths: _*) .map(v => (getOafType(v), v))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - .cache() try { ModelSupport.oafTypes