Update workflow.xml && job.properties

Serafeim Chatzopoulos 2023-05-15 15:50:23 +03:00
parent 07818131ef
commit b8e8c959fe
2 changed files with 163 additions and 143 deletions

job.properties

@@ -1,18 +1,16 @@
 # The following set of properties are defined in https://support.openaire.eu/projects/openaire/wiki/Hadoop_clusters
 # and concern the parameterization required for running workflows on the @GARR cluster
-dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
-dhp.hadoop.frontend.user.name=ilias.kanellos
-dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
-dhp.hadoop.frontend.port.ssh=22
-oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
-jobTracker=yarnRM
-nameNode=hdfs://nameservice1
-oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
-maven.executable=mvn
-sparkDriverMemory=7G
-sparkExecutorMemory=7G
-sparkExecutorCores=4
+# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties ---
+# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
+# dhp.hadoop.frontend.user.name=ilias.kanellos
+# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
+# dhp.hadoop.frontend.port.ssh=22
+# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
+# jobTracker=yarnRM
+# nameNode=hdfs://nameservice1
+# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
+# maven.executable=mvn
 # Some memory and driver settings for more demanding tasks
 sparkHighDriverMemory=20G
@@ -21,6 +19,9 @@ sparkNormalDriverMemory=10G
 sparkHighExecutorMemory=20G
 sparkNormalExecutorMemory=10G
+sparkExecutorCores=4
+sparkShufflePartitions=7680
 # The above is given differently in an example I found online
 oozie.action.sharelib.for.spark=spark2
 oozieActionShareLibForSpark2=spark2
@@ -66,29 +67,26 @@ ramGamma=0.6
 convergenceError=0.000000000001
 # I think this should be the oozie workflow directory
-oozieWorkflowPath=user/ilias.kanellos/workflow_example/
-# The directory where the workflow data is/should be stored
-workflowDataDir=user/ilias.kanellos/ranking_workflow
+# oozieWorkflowPath=user/ilias.kanellos/workflow_example/
 # Directory where json data containing scores will be output
-bipScorePath=${workflowDataDir}/openaire_universe_scores/
+bipScorePath=${workingDir}/openaire_universe_scores/
 # Directory where dataframes are checkpointed
-checkpointDir=${nameNode}/${workflowDataDir}/check/
+checkpointDir=${nameNode}/${workingDir}/check/
 # The directory for the doi-based bip graph
-bipGraphFilePath=${nameNode}/${workflowDataDir}/bipdbv8_graph
+bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph
 # The folder from which synonyms of openaire-ids are read
 # openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/
-openaireDataInput=${/tmp/prod_provision/graph/18_graph_blacklisted}
+openaireDataInput=/tmp/prod_provision/graph/18_graph_blacklisted
 # A folder where we will write the openaire to doi mapping
-synonymFolder=${nameNode}/${workflowDataDir}/openaireid_to_dois/
+synonymFolder=${nameNode}/${workingDir}/openaireid_to_dois/
 # This will be where we store the openaire graph input. They told us on GARR to use a directory under /data
-openaireGraphInputPath=${nameNode}/${workflowDataDir}/openaire_id_graph
+openaireGraphInputPath=${nameNode}/${workingDir}/openaire_id_graph
 # The workflow application path
 wfAppPath=${nameNode}/${oozieWorkflowPath}
@@ -96,8 +94,8 @@ wfAppPath=${nameNode}/${oozieWorkflowPath}
 oozie.wf.application.path=${wfAppPath}
 # Path where the final output should be?
-actionSetOutputPath=${workflowDataDir}/bip_actionsets/
+actionSetOutputPath=${workingDir}/bip_actionsets/
 # The directory to store project impact indicators
-projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators
+projectImpactIndicatorsOutput=${workingDir}/project_indicators
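
Note on the commented-out block at the top of this file: per the new header comment, the frontend/cluster connection settings are now expected to come from the submitter's ~/.dhp/application.properties rather than being hard-coded here. A minimal sketch of such an override file (the user name and temp dir below are placeholders, not values from this commit; host and port are the ones shown commented out above):

# placeholder account on the gateway host -- replace your.user with your own
dhp.hadoop.frontend.user.name=your.user
dhp.hadoop.frontend.temp.dir=/home/your.user
# host and port as shown (commented out) in job.properties above
dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
dhp.hadoop.frontend.port.ssh=22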

workflow.xml

@@ -46,21 +46,23 @@
 <!-- Script name goes here -->
 <jar>create_openaire_ranking_graph.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 20G --executor-cores 4 --driver-memory 20G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkHighDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <!-- The openaire graph data from which to read relations and objects -->
 <arg>${openaireDataInput}</arg>
 <!-- Year for filtering entries w/ larger values / empty -->
 <arg>${currentYear}</arg>
 <!-- number of partitions to be used on joins -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <!-- The output of the graph should be the openaire input graph for ranking-->
 <arg>${openaireGraphInputPath}</arg>
 <!-- This needs to point to the file on the hdfs i think -->
@@ -100,18 +102,20 @@
 <!-- Script name goes here -->
 <jar>CC.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>${openaireGraphInputPath}</arg>
 <!-- number of partitions to be used on joins -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <!-- This needs to point to the file on the hdfs i think -->
 <file>${wfAppPath}/CC.py#CC.py</file>
 </spark>
@@ -141,21 +145,23 @@
 <!-- Script name goes here -->
 <jar>TAR.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>${openaireGraphInputPath}</arg>
 <arg>${ramGamma}</arg>
 <arg>${currentYear}</arg>
 <arg>RAM</arg>
 <!-- number of partitions to be used on joins -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <arg>${checkpointDir}</arg>
 <!-- This needs to point to the file on the hdfs i think -->
 <file>${wfAppPath}/TAR.py#TAR.py</file>
@@ -189,18 +195,20 @@
 <!-- Script name goes here -->
 <jar>CC.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>${openaireGraphInputPath}</arg>
 <!-- number of partitions to be used on joins -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <arg>3</arg>
 <!-- This needs to point to the file on the hdfs i think -->
 <file>${wfAppPath}/CC.py#CC.py</file>
@@ -244,21 +252,23 @@
 <!-- Script name goes here -->
 <jar>PageRank.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>${openaireGraphInputPath}</arg>
 <arg>${pageRankAlpha}</arg>
 <arg>${convergenceError}</arg>
 <arg>${checkpointDir}</arg>
 <!-- number of partitions to be used on joins -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <arg>dfs</arg>
 <!-- This needs to point to the file on the hdfs i think -->
 <file>${wfAppPath}/PageRank.py#PageRank.py</file>
@@ -289,14 +299,16 @@
 <!-- Script name goes here -->
 <jar>AttRank.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>${openaireGraphInputPath}</arg>
 <arg>${attrankAlpha}</arg>
@@ -308,7 +320,7 @@
 <arg>${convergenceError}</arg>
 <arg>${checkpointDir}</arg>
 <!-- number of partitions to be used on joins -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <arg>dfs</arg>
 <!-- This needs to point to the file on the hdfs i think -->
 <file>${wfAppPath}/AttRank.py#AttRank.py</file>
@@ -339,7 +351,7 @@
 <!-- name of script to run -->
 <argument>get_ranking_files.sh</argument>
 <!-- We only pass the directory where we expect to find the rankings -->
-<argument>/${workflowDataDir}</argument>
+<argument>/${workingDir}</argument>
 <!-- the name of the file run -->
 <file>${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh</file>
@@ -381,24 +393,26 @@
 <!-- Script name goes here -->
 <jar>format_ranking_results.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkNormalExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>json-5-way</arg>
 <!-- Input files must be identified dynamically -->
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
 <!-- Num partitions -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
 <arg>openaire</arg>
 <!-- This needs to point to the file on the hdfs i think -->
@@ -429,24 +443,26 @@
 <!-- Script name goes here -->
 <jar>format_ranking_results.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkNormalExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>zenodo</arg>
 <!-- Input files must be identified dynamically -->
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
 <!-- Num partitions -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
 <arg>openaire</arg>
 <!-- This needs to point to the file on the hdfs i think -->
@@ -484,14 +500,16 @@
 <!-- Script name goes here -->
 <jar>map_openaire_ids_to_dois.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkHighDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>${openaireDataInput}</arg>
 <!-- number of partitions to be used on joins -->
@@ -526,24 +544,26 @@
 <!-- Script name goes here -->
 <jar>map_scores_to_dois.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkHighDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
 <arg>${synonymFolder}</arg>
 <!-- Number of partitions -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <!-- The remaining input are the ranking files fproduced for bip db-->
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
 <!-- This needs to point to the file on the hdfs i think -->
 <file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
@@ -576,9 +596,9 @@
 <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
 <jar>dhp-aggregation-${projectVersion}.jar</jar>
 <spark-opts>
---executor-memory=${sparkExecutorMemory}
+--executor-memory=${sparkNormalExecutorMemory}
 --executor-cores=${sparkExecutorCores}
---driver-memory=${sparkDriverMemory}
+--driver-memory=${sparkNormalDriverMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -609,14 +629,16 @@
 <!-- Script name goes here -->
 <jar>projects_impact.py</jar>
 <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
---master yarn
---deploy-mode cluster
---conf spark.sql.shuffle.partitions=7680
+<spark-opts>
+--executor-memory=${sparkHighExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkNormalDriverMemory}
+--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
---conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+</spark-opts>
 <!-- Script arguments here -->
@@ -624,13 +646,13 @@
 <arg>${openaireDataInput}/relations</arg>
 <!-- input files with impact indicators for results -->
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
 <!-- number of partitions to be used on joins -->
-<arg>7680</arg>
+<arg>${sparkShufflePartitions}</arg>
 <arg>${projectImpactIndicatorsOutput}</arg>
@@ -654,9 +676,9 @@
 <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
 <jar>dhp-aggregation-${projectVersion}.jar</jar>
 <spark-opts>
---executor-memory=${sparkExecutorMemory}
+--executor-memory=${sparkNormalExecutorMemory}
 --executor-cores=${sparkExecutorCores}
---driver-memory=${sparkDriverMemory}
+--driver-memory=${sparkNormalDriverMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
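
Usage sketch (not part of this commit): with the workflow application deployed at wfAppPath on HDFS and the properties above in place, the job would typically be submitted from the gateway host with the standard Oozie CLI. The endpoint below is the oozieServiceLoc shown (commented out) in job.properties and is assumed here for illustration.

# submit the ranking workflow using the updated job.properties
oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -config job.properties -run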