From 34358afe756d2ea602a1aa7e58969b2adbb204cf Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 20 Oct 2023 15:48:27 +0200 Subject: [PATCH] modified resource file, workflow anf default-config. Add 3g of memory Overhead and specified the shuffle partition in the wf confiduration. Removed the multiple instantiation in the wf because of different implementation of the spark job --- .../dhp/bulktag/input_bulkTag_parameters.json | 28 ++--- .../dhp/bulktag/oozie_app/config-default.xml | 10 +- .../dhp/bulktag/oozie_app/workflow.xml | 108 ++---------------- 3 files changed, 25 insertions(+), 121 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json index a8be7c32e..dbe2d088f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json @@ -1,10 +1,5 @@ [ - { - "paramName":"is", - "paramLongName":"isLookUpUrl", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, + { "paramName":"s", "paramLongName":"sourcePath", @@ -17,12 +12,7 @@ "paramDescription": "the json path associated to each selection field", "paramRequired": true }, - { - "paramName":"tn", - "paramLongName":"resultTableName", - "paramDescription": "the name of the result table we are currently working on", - "paramRequired": true - }, + { "paramName": "out", "paramLongName": "outputPath", @@ -35,17 +25,19 @@ "paramDescription": "true if the spark session is managed, false otherwise", "paramRequired": false }, - { - "paramName": "test", - "paramLongName": "isTest", - "paramDescription": "Parameter intended for testing purposes only. True if the reun is relatesd to a test and so the taggingConf parameter should be loaded", - "paramRequired": false - }, + { "paramName": "tg", "paramLongName": "taggingConf", "paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed", "paramRequired": false + }, + + { + "paramName": "p", + "paramLongName": "production", + "paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml index fe82ae194..c92f559f9 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml @@ -45,10 +45,14 @@ sparkExecutorMemory - 6G + 5G - sparkExecutorCores - 1 + memoryOverhead + 3g + + + partitions + 3284 \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index b868e4c72..4b81c58e4 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -4,10 +4,6 @@ sourcePath the source path - - isLookUpUrl - the isLookup service endpoint - pathMap the json path associated to each selection field @@ -102,16 +98,9 @@ - + - - - - - - - - + yarn-cluster cluster @@ -122,104 +111,23 @@ --num-executors=${sparkExecutorNumber} --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} + --conf spark.executor.memoryOverhead=${memeoryOverhead} + --conf spark.sql.shuffle.partitions=${partitions} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --sourcePath${sourcePath}/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${outputPath}/publication + --sourcePath${sourcePath}/ + --outputPath${outputPath}/ --pathMap${pathMap} - --isLookUpUrl${isLookUpUrl} + --production${production} - + - - - yarn-cluster - cluster - bulkTagging-dataset - eu.dnetlib.dhp.bulktag.SparkBulkTagJob - dhp-enrichment-${projectVersion}.jar - - --num-executors=${sparkExecutorNumber} - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${sourcePath}/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${outputPath}/dataset - --pathMap${pathMap} - --isLookUpUrl${isLookUpUrl} - - - - - - - - yarn-cluster - cluster - bulkTagging-orp - eu.dnetlib.dhp.bulktag.SparkBulkTagJob - dhp-enrichment-${projectVersion}.jar - - --num-executors=${sparkExecutorNumber} - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${sourcePath}/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${outputPath}/otherresearchproduct - --pathMap${pathMap} - --isLookUpUrl${isLookUpUrl} - - - - - - - - yarn-cluster - cluster - bulkTagging-software - eu.dnetlib.dhp.bulktag.SparkBulkTagJob - dhp-enrichment-${projectVersion}.jar - - --num-executors=${sparkExecutorNumber} - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${sourcePath}/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${outputPath}/software - --pathMap${pathMap} - --isLookUpUrl${isLookUpUrl} - - - - - -