<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.5" name="ranking-wf">
<!-- Start with a decision node, so the ${resume} property can select the
     point from which a (re-)run of the workflow continues -->
<!-- <start to="get-doi-synonyms" /> -->
<start to="entry-point-decision"/>

<decision name="entry-point-decision">
    <switch>
        <!-- The default is the normal start (graph creation);
             any other ${resume} value jumps to the corresponding stage -->
        <case to="non-iterative-rankings">${resume eq "rankings-start"}</case>
        <case to="spark-impulse">${resume eq "impulse"}</case>
        <case to="iterative-rankings">${resume eq "rankings-iterative"}</case>
        <case to="get-file-names">${resume eq "format-results"}</case>
        <case to="map-openaire-to-doi">${resume eq "map-ids"}</case>
        <case to="map-scores-to-dois">${resume eq "map-scores"}</case>
        <case to="create-openaire-ranking-graph">${resume eq "start"}</case>
        <case to="project-impact-indicators">${resume eq "projects-impact"}</case>
        <!-- TODO: add action set creation here -->
        <default to="create-openaire-ranking-graph"/>
    </switch>
</decision>
<!-- Builds the openaire input graph for the ranking computations -->
<action name="create-openaire-ranking-graph">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <!-- Not needed anymore since synonyms are no longer produced here
        <prepare>
            <delete path="${synonymFolder}"/>
        </prepare>
        -->
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Openaire Ranking Graph Creation</name>
        <jar>create_openaire_ranking_graph.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkHighDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <!-- The openaire graph data from which to read relations and objects -->
        <arg>${openaireDataInput}</arg>
        <!-- Year used to filter out entries with larger/empty values -->
        <arg>${currentYear}</arg>
        <!-- Number of partitions to be used on joins -->
        <arg>${sparkShufflePartitions}</arg>
        <!-- The output is the openaire input graph for ranking -->
        <arg>${openaireGraphInputPath}</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py</file>
    </spark>
    <!-- NOTE(review): ok transitions straight to "end", skipping every ranking
         step — confirm this is intentional and not a leftover from testing -->
    <ok to="end"/>
    <error to="openaire-graph-error"/>
</action>
<!-- Citation Count and RAM are calculated in parallel;
     Impulse requires resources and runs after them -->
<fork name="non-iterative-rankings">
    <path start="spark-cc"/>
    <!-- <path start="spark-impulse"/> -->
    <path start="spark-ram"/>
</fork>
<!-- Citation Count (CC) -->
<action name="spark-cc">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Spark CC</name>
        <jar>CC.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <arg>${openaireGraphInputPath}</arg>
        <!-- Number of partitions to be used on joins -->
        <arg>${sparkShufflePartitions}</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/CC.py#CC.py</file>
    </spark>
    <ok to="join-non-iterative-rankings"/>
    <error to="cc-fail"/>
</action>
<!-- RAM here (comment previously said IMPULSE, but this action runs RAM via TAR.py) -->
<action name="spark-ram">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Spark RAM</name>
        <jar>TAR.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <arg>${openaireGraphInputPath}</arg>
        <arg>${ramGamma}</arg>
        <arg>${currentYear}</arg>
        <arg>RAM</arg>
        <!-- Number of partitions to be used on joins -->
        <arg>${sparkShufflePartitions}</arg>
        <arg>${checkpointDir}</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/TAR.py#TAR.py</file>
    </spark>
    <ok to="join-non-iterative-rankings"/>
    <error to="ram-fail"/>
</action>
<!-- Join the non-iterative methods, then continue to the iterative ones -->
<join name="join-non-iterative-rankings" to="spark-impulse"/>
<!-- IMPULSE here: reuses CC.py with a fixed iteration bound of 3 -->
<action name="spark-impulse">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Spark Impulse</name>
        <jar>CC.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <arg>${openaireGraphInputPath}</arg>
        <!-- Number of partitions to be used on joins -->
        <arg>${sparkShufflePartitions}</arg>
        <arg>3</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/CC.py#CC.py</file>
    </spark>
    <ok to="iterative-rankings"/>
    <error to="impulse-fail"/>
</action>
<!-- PageRank and AttRank (iterative methods) run in parallel -->
<fork name="iterative-rankings">
    <path start="spark-pagerank"/>
    <path start="spark-attrank"/>
</fork>
<!-- PAGERANK here -->
<action name="spark-pagerank">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <!-- Reference: master is the url of the Spark master, e.g. spark://host:port,
             mesos://host:port, yarn-cluster, yarn-master or local;
             mode is where the spark driver runs (client or cluster) -->
        <!-- <master>local[*]</master> -->
        <!-- <mode>client</mode> -->
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Spark Pagerank</name>
        <jar>PageRank.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <arg>${openaireGraphInputPath}</arg>
        <arg>${pageRankAlpha}</arg>
        <arg>${convergenceError}</arg>
        <arg>${checkpointDir}</arg>
        <!-- Number of partitions to be used on joins -->
        <arg>${sparkShufflePartitions}</arg>
        <arg>dfs</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/PageRank.py#PageRank.py</file>
    </spark>
    <ok to="join-iterative-rankings"/>
    <error to="pagerank-fail"/>
</action>
<!-- ATTRANK here -->
<action name="spark-attrank">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Spark AttRank</name>
        <jar>AttRank.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <arg>${openaireGraphInputPath}</arg>
        <arg>${attrankAlpha}</arg>
        <arg>${attrankBeta}</arg>
        <arg>${attrankGamma}</arg>
        <arg>${attrankRho}</arg>
        <arg>${currentYear}</arg>
        <arg>${attrankStartYear}</arg>
        <arg>${convergenceError}</arg>
        <arg>${checkpointDir}</arg>
        <!-- Number of partitions to be used on joins -->
        <arg>${sparkShufflePartitions}</arg>
        <arg>dfs</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/AttRank.py#AttRank.py</file>
    </spark>
    <ok to="join-iterative-rankings"/>
    <error to="attrank-fail"/>
</action>
<!-- Join the iterative methods, then continue to file-name collection -->
<join name="join-iterative-rankings" to="get-file-names"/>
<!-- Shell action that outputs key-value pairs naming the ranking output files -->
<action name="get-file-names">
    <!-- Required wrapper tag for shell jobs -->
    <shell xmlns="uri:oozie:shell-action:0.3">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <!-- Shell interpreter that runs the command -->
        <exec>/usr/bin/bash</exec>
        <!-- Name of the script to run -->
        <argument>get_ranking_files.sh</argument>
        <!-- Directory where we expect to find the rankings -->
        <argument>/${workingDir}</argument>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh</file>
        <!-- Capture stdout so following actions can read the file names -->
        <capture-output/>
    </shell>
    <ok to="format-result-files"/>
    <error to="filename-getting-error"/>
</action>
<!-- Format the ranking files for BiP! DB and openaire (json) in parallel -->
<fork name="format-result-files">
    <path start="format-bip-files"/>
    <path start="format-json-files"/>
</fork>
<!-- Format json files: a) format b) rename endings to .json.gz -->
<action name="format-json-files">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Format Ranking Results JSON</name>
        <jar>format_ranking_results.py</jar>
        <spark-opts>
            --executor-memory=${sparkNormalExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments: formatting mode first -->
        <arg>json-5-way</arg>
        <!-- Input files are identified dynamically from the get-file-names output -->
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
        <!-- Num partitions -->
        <arg>${sparkShufflePartitions}</arg>
        <!-- Type of data to be produced [bip (dois) / openaire (openaire-ids)] -->
        <arg>openaire</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
    </spark>
    <ok to="join-file-formatting"/>
    <error to="json-formatting-fail"/>
</action>
<!-- Second parallel branch: create the BiP! DB files -->
<action name="format-bip-files">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Format Ranking Results BiP! DB</name>
        <jar>format_ranking_results.py</jar>
        <spark-opts>
            --executor-memory=${sparkNormalExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments: formatting mode first -->
        <arg>zenodo</arg>
        <!-- Input files are identified dynamically from the get-file-names output -->
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
        <!-- Num partitions -->
        <arg>${sparkShufflePartitions}</arg>
        <!-- Type of data to be produced [bip (dois) / openaire (openaire-ids)] -->
        <!-- NOTE(review): this branch is labelled "BiP! DB" but passes "openaire"
             as the data type, same as the json branch — confirm this is intended -->
        <arg>openaire</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
    </spark>
    <ok to="join-file-formatting"/>
    <error to="bip-formatting-fail"/>
</action>
<!-- Finish formatting data, then continue to id mapping -->
<join name="join-file-formatting" to="map-openaire-to-doi"/>
<!-- Maps openaire ids to their DOI synonyms (written by Serafeim) -->
<action name="map-openaire-to-doi">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <!-- Delete the previously created doi synonym folder before writing -->
        <prepare>
            <delete path="${synonymFolder}"/>
        </prepare>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Openaire-DOI synonym collection</name>
        <jar>map_openaire_ids_to_dois.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkHighDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <arg>${openaireDataInput}</arg>
        <!-- Output folder for the synonyms -->
        <arg>${synonymFolder}</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
    </spark>
    <ok to="map-scores-to-dois"/>
    <error to="synonym-collection-fail"/>
</action>
<!-- Maps openaire-id scores onto DOIs using the synonym folder -->
<action name="map-scores-to-dois">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Mapping Openaire Scores to DOIs</name>
        <jar>map_scores_to_dois.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkHighDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <arg>${synonymFolder}</arg>
        <!-- Number of partitions -->
        <arg>${sparkShufflePartitions}</arg>
        <!-- The remaining inputs are the ranking files produced for bip db -->
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
    </spark>
    <ok to="deleteOutputPathForActionSet"/>
    <error to="map-scores-fail"/>
</action>
<!-- Recreate empty output folders for the result/project action sets -->
<action name="deleteOutputPathForActionSet">
    <fs>
        <delete path="${actionSetOutputPath}/results/"/>
        <delete path="${actionSetOutputPath}/projects/"/>
        <mkdir path="${actionSetOutputPath}/results/"/>
        <mkdir path="${actionSetOutputPath}/projects/"/>
    </fs>
    <ok to="createActionSetForResults"/>
    <error to="actionset-delete-fail"/>
</action>
<!-- Produces the atomic actions with the bip finder scores for results -->
<action name="createActionSetForResults">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Produces the atomic action with the bip finder scores for publications</name>
        <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
        <jar>dhp-aggregation-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkNormalExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--inputPath</arg><arg>${bipScorePath}</arg>
        <arg>--outputPath</arg><arg>${actionSetOutputPath}/results/</arg>
        <arg>--targetEntity</arg><arg>result</arg>
    </spark>
    <ok to="project-impact-indicators"/>
    <error to="actionset-creation-fail"/>
</action>
<!-- Aggregates result impact indicators to project level -->
<action name="project-impact-indicators">
    <!-- Required wrapper tag for spark jobs, regardless of programming language -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Project Impact Indicators</name>
        <jar>projects_impact.py</jar>
        <spark-opts>
            --executor-memory=${sparkHighExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Script arguments -->
        <!-- Graph data folder from which to read relations -->
        <arg>${openaireDataInput}/relations</arg>
        <!-- Input files with impact indicators for results -->
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
        <!-- Number of partitions to be used on joins -->
        <arg>${sparkShufflePartitions}</arg>
        <arg>${projectImpactIndicatorsOutput}</arg>
        <!-- HDFS-visible copy of the script -->
        <file>${wfAppPath}/projects_impact.py#projects_impact.py</file>
    </spark>
    <ok to="createActionSetForProjects"/>
    <error to="project-impact-indicators-fail"/>
</action>
<!-- Package the project impact scores into an OpenAIRE actionset via the
     generic SparkAtomicActionScoreJob, targeting 'project' entities. -->
<action name="createActionSetForProjects">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Produces the atomic action with the bip finder scores for projects</name>
        <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
        <jar>dhp-aggregation-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkNormalExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkNormalDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <!-- Read the scores produced by 'project-impact-indicators' -->
        <arg>--inputPath</arg><arg>${projectImpactIndicatorsOutput}</arg>
        <!-- Write the actionset under a projects-specific subfolder -->
        <arg>--outputPath</arg><arg>${actionSetOutputPath}/projects/</arg>
        <arg>--targetEntity</arg><arg>project</arg>
    </spark>
    <ok to="end"/>
    <error to="actionset-project-creation-fail"/>
</action>
<!-- Define ending node -->
<end name="end"/>
<!-- Definitions of failure messages; each kill node is the error target of
     the correspondingly named action earlier in the workflow -->
<kill name="pagerank-fail">
    <message>PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="attrank-fail">
    <message>AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="cc-fail">
    <message>CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="impulse-fail">
    <message>Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="ram-fail">
    <message>RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="openaire-graph-error">
    <message>Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="synonym-collection-fail">
    <message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="map-scores-fail">
    <message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="actionset-delete-fail">
    <message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="actionset-creation-fail">
    <message>ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="project-impact-indicators-fail">
    <message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="actionset-project-creation-fail">
    <message>ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
</workflow-app>