2023-08-14 15:50:48 +02:00
1 changed files with 187 additions and 70 deletions
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@ -46,6 +46,7 @@
 			<!-- Script name goes here -->
 			<jar>create_openaire_ranking_graph.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -56,6 +57,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<!-- The openaire graph data from which to read relations and objects -->
 			<arg>${openaireDataInput}</arg>
@ -102,6 +113,7 @@
 			<!-- Script name goes here -->
 			<jar>CC.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -112,6 +124,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>${openaireGraphInputPath}</arg>
 			<!-- number of partitions to be used on joins -->
@ -145,6 +167,7 @@
 			<!-- Script name goes here -->
 			<jar>TAR.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -155,6 +178,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>${openaireGraphInputPath}</arg>
 			<arg>${ramGamma}</arg>
@ -195,6 +228,7 @@
 			<!-- Script name goes here -->
 			<jar>CC.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -205,6 +239,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>${openaireGraphInputPath}</arg>
 			<!-- number of partitions to be used on joins -->
@ -252,6 +296,7 @@
 			<!-- Script name goes here -->
 			<jar>PageRank.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -262,6 +307,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>${openaireGraphInputPath}</arg>
 			<arg>${pageRankAlpha}</arg>
@ -299,6 +354,7 @@
 			<!-- Script name goes here -->
 			<jar>AttRank.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -309,6 +365,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>${openaireGraphInputPath}</arg>
 			<arg>${attrankAlpha}</arg>
@ -393,6 +459,7 @@
 			<!-- Script name goes here -->
 			<jar>format_ranking_results.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkNormalExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -403,6 +470,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>json-5-way</arg>
 			<!-- Input files must be identified dynamically -->
@ -443,6 +520,7 @@
 			<!-- Script name goes here -->
 			<jar>format_ranking_results.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkNormalExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -453,6 +531,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>zenodo</arg>
 			<!-- Input files must be identified dynamically -->
@ -500,6 +588,7 @@
 			<!-- Script name goes here -->
 			<jar>map_openaire_ids_to_dois.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -510,6 +599,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>${openaireDataInput}</arg>
 			<!-- number of partitions to be used on joins -->
@ -544,6 +643,7 @@
 			<!-- Script name goes here -->
 			<jar>map_scores_to_dois.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
@ -554,6 +654,16 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory}
+					--master yarn
+					--deploy-mode cluster
+					--conf spark.sql.shuffle.partitions=7680
+					--conf spark.extraListeners=${spark2ExtraListeners}
+                			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 			<!-- Script arguments here -->
 			<arg>${synonymFolder}</arg>
 			<!-- Number of partitions -->
@ -629,11 +739,18 @@
 			<!-- Script name goes here -->
 			<jar>projects_impact.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
+<<<<<<< HEAD
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+=======
+			<spark-opts>--executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory}
+				--master yarn
+				--deploy-mode cluster
+				--conf spark.sql.shuffle.partitions=7680
+>>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}