Removed dataset caching, as it appears to have too large an impact on the memory footprint.

This commit is contained in:
Claudio Atzori 2025-01-08 15:44:58 +01:00
parent 6c31fddd03
commit 3549b6976d
2 changed files with 1 addition and 1 deletion

View File

@ -639,6 +639,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/graph_raw</arg>

View File

@ -62,7 +62,6 @@ object CopyHdfsOafSparkApplication {
val oaf = spark.read
.textFile(validPaths: _*)
.map(v => (getOafType(v), v))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.cache()
try {
ModelSupport.oafTypes