<workflow-app xmlns="uri:oozie:workflow:0.5" name="ranking-wf">
<!-- Start with a decision node, so that we can choose the point from which the job continues -->
<!-- <start to="get-doi-synonyms" /> -->
<start to="entry-point-decision"/>
<decision name="entry-point-decision">
<switch>
<!-- The default is the full run, starting from create-openaire-ranking-graph -->
<!-- If the resume property is set to one of the values below, jump to the corresponding step -->
<case to="non-iterative-rankings">${resume eq "rankings-start"}</case>
<case to="spark-impulse">${resume eq "impulse"}</case>
<case to="iterative-rankings">${resume eq "rankings-iterative"}</case>
<case to="get-file-names">${resume eq "format-results"}</case>
<case to="map-openaire-to-doi">${resume eq "map-ids"}</case>
<case to="map-scores-to-dois">${resume eq "map-scores"}</case>
<case to="create-openaire-ranking-graph">${resume eq "start"}</case>
<!-- TODO: add a resume case for the actionset creation step -->
<default to="create-openaire-ranking-graph"/>
</switch>
</decision>
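<!-- Usage note: the resume property is expected to be supplied at submission time,
     e.g. (assuming the standard Oozie CLI; exact flags depend on the local installation):
     oozie job -config job.properties -D resume=impulse -run -->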
<!-- Creates the openaire ranking graph on which all the rankings will be computed -->
<action name="create-openaire-ranking-graph">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- Delete the previously created doi synonym folder -->
<prepare>
<delete path="${synonymFolder}"/>
</prepare>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Openaire Ranking Graph Creation</name>
<!-- Script name goes here -->
<jar>create_openaire_ranking_graph.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 20G --executor-cores 4 --driver-memory 20G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<!-- The openaire graph data from which to read relations and objects -->
<arg>${openaireDataInput}</arg>
<!-- Current year, used to filter out entries with later or empty year values -->
<arg>${currentYear}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<!-- The output is the openaire graph used as input for the rankings -->
<arg>${openaireGraphInputPath}</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="non-iterative-rankings"/>
<!-- Go here on error -->
<error to="openaire-graph-error"/>
</action>
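<!-- The EL variables used above (and below) are expected to come from the job.properties
     file submitted with the workflow; a minimal hedged sketch, with illustrative values only:
     jobTracker=yarnRM
     nameNode=hdfs://nameservice1
     wfAppPath=hdfs://nameservice1/user/example/ranking-wf
     openaireDataInput=/path/to/openaire/graph
     openaireGraphInputPath=/path/to/ranking/input/graph
     synonymFolder=/path/to/doi/synonyms
     workflowDataDir=path/to/workflow/data
     currentYear=2023
-->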
<!-- Citation Count and RAM are calculated in parallel -->
<!-- Impulse requires resources and is run afterwards -->
<fork name="non-iterative-rankings">
<path start="spark-cc"/>
<!-- <path start="spark-impulse"/> -->
<path start="spark-ram"/>
</fork>
<!-- CC here -->
<action name="spark-cc">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark CC</name>
<!-- Script name goes here -->
<jar>CC.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/CC.py#CC.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="join-non-iterative-rankings"/>
<!-- Go here on error -->
<error to="cc-fail"/>
</action>
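<!-- A hedged sketch (an assumption, not the actual repository script) of what a
     citation-count job like CC.py might do in PySpark; the input schema, the output
     layout, and the handling of the optional citation-window argument used by the
     impulse action below are all guesses:

     import sys
     from pyspark.sql import SparkSession, functions as F

     graph_path, num_partitions = sys.argv[1], int(sys.argv[2])
     spark = SparkSession.builder.appName('CC').getOrCreate()
     # assume one citation per record, with 'source' and 'target' columns
     edges = spark.read.json(graph_path)
     scores = (edges.groupBy('target')
                    .agg(F.count('source').alias('cc'))
                    .repartition(num_partitions))
     scores.write.mode('overwrite').json(graph_path + '_cc')
     spark.stop()
-->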
<!-- RAM here -->
<action name="spark-ram">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark RAM</name>
<!-- Script name goes here -->
<jar>TAR.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<arg>${ramGamma}</arg>
<arg>${currentYear}</arg>
<arg>RAM</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<!-- Checkpoint directory (assumed by analogy with the iterative rankings below) -->
<arg>${checkpointDir}</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/TAR.py#TAR.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="join-non-iterative-rankings"/>
<!-- Go here on error -->
<error to="ram-fail"/>
</action>
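<!-- Note (an interpretation, not stated in this file): RAM is understood as a time-aware
     citation count, where each citation is discounted exponentially by its age, roughly
     score(p) = sum over citations c of p of ramGamma^(currentYear - year(c)). -->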
<!-- JOIN NON-ITERATIVE METHODS AND THEN CONTINUE TO ITERATIVE ONES -->
<join name="join-non-iterative-rankings" to="spark-impulse"/>
<!-- IMPULSE here -->
<action name="spark-impulse">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark Impulse</name>
<!-- Impulse reuses the citation-count script; see the extra argument below -->
<jar>CC.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
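<!-- Citation-window length in years (this reading is an assumption: impulse appears to count citations received within the first years after publication) -->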
<arg>3</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/CC.py#CC.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="iterative-rankings"/>
<!-- Go here on error -->
<error to="impulse-fail"/>
</action>
<fork name= "iterative-rankings" >
<path start= "spark-pagerank" />
<path start= "spark-attrank" />
</fork>
<!-- PAGERANK here -->
<action name="spark-pagerank">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- map-reduce configs could be added here, but they do not seem to be needed -->
<!-- This is the type of master-client configuration for running spark -->
<!-- <master>yarn-client</master> -->
<!-- Reference says: The master element indicates the url of the Spark Master. Ex: spark://host:port, mesos://host:port, yarn-cluster, yarn-master, or local. -->
<!-- <master>local[*]</master> -->
<!-- Reference says: The mode element if present indicates the mode of spark, where to run spark driver program. Ex: client, cluster. -->
<!-- <mode>client</mode> -->
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark Pagerank</name>
<!-- Script name goes here -->
<jar>PageRank.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<arg>${pageRankAlpha}</arg>
<arg>${convergenceError}</arg>
<arg>${checkpointDir}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<arg>dfs</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/PageRank.py#PageRank.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="join-iterative-rankings"/>
<!-- Go here on error -->
<error to="pagerank-fail"/>
</action>
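<!-- A hedged sketch (an assumption, not the repository's PageRank.py) of the iterative
     step, assuming pageRankAlpha is the damping factor, the graph is a (source, target)
     edge list, and ignoring the final 'dfs' mode argument:

     import sys
     from pyspark.sql import SparkSession, functions as F

     graph_path = sys.argv[1]
     alpha = float(sys.argv[2])   # damping factor
     tol = float(sys.argv[3])     # convergence error
     ckpt_dir = sys.argv[4]
     parts = int(sys.argv[5])

     spark = SparkSession.builder.appName('PageRank').getOrCreate()
     spark.sparkContext.setCheckpointDir(ckpt_dir)
     edges = spark.read.json(graph_path).repartition(parts, 'source')
     nodes = (edges.select(F.col('source').alias('id'))
                   .union(edges.select(F.col('target').alias('id')))
                   .distinct())
     n = nodes.count()
     ranks = nodes.withColumn('rank', F.lit(1.0 / n))
     out_deg = edges.groupBy('source').agg(F.count('target').alias('deg'))
     delta = float('inf')
     while delta > tol:
         contribs = (edges.join(out_deg, 'source')
                          .join(ranks, F.col('source') == F.col('id'))
                          .select('target', (F.col('rank') / F.col('deg')).alias('c'))
                          .groupBy('target').agg(F.sum('c').alias('s')))
         new_ranks = (nodes.join(contribs, nodes['id'] == contribs['target'], 'left')
                           .select(nodes['id'],
                                   (F.lit((1.0 - alpha) / n)
                                    + alpha * F.coalesce(F.col('s'), F.lit(0.0))).alias('rank'))
                           .checkpoint())  # truncate the lineage on each iteration
         delta = (new_ranks.alias('a').join(ranks.alias('b'), 'id')
                           .select(F.sum(F.abs(F.col('a.rank') - F.col('b.rank'))))
                           .first()[0])
         ranks = new_ranks
     ranks.write.mode('overwrite').json(graph_path + '_pagerank')
     spark.stop()
-->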
<!-- ATTRANK here -->
<action name="spark-attrank">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark AttRank</name>
<!-- Script name goes here -->
<jar>AttRank.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<arg>${attrankAlpha}</arg>
<arg>${attrankBeta}</arg>
<arg>${attrankGamma}</arg>
<arg>${attrankRho}</arg>
<arg>${currentYear}</arg>
<arg>${attrankStartYear}</arg>
<arg>${convergenceError}</arg>
<arg>${checkpointDir}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<arg>dfs</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/AttRank.py#AttRank.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="join-iterative-rankings"/>
<!-- Go here on error -->
<error to="attrank-fail"/>
</action>
<!-- JOIN ITERATIVE METHODS AND THEN FORMAT THE RESULTS -->
<join name="join-iterative-rankings" to="get-file-names"/>
<!-- A shell action that outputs key-value pairs describing the ranking output files -->
<action name="get-file-names">
<!-- This tag is required for shell jobs -->
<shell xmlns="uri:oozie:shell-action:0.3">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- exec is required for shell commands and points to the executable to run -->
<exec>/usr/bin/bash</exec>
<!-- name of the script to run -->
<argument>get_ranking_files.sh</argument>
<!-- We only pass the directory where we expect to find the rankings -->
<argument>/${workflowDataDir}</argument>
<!-- the script file shipped with the job -->
<file>${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh</file>
<!-- Capture the output so that it is usable by the following actions -->
<capture-output/>
</shell>
<!-- Continue here after finishing successfully -->
<ok to="format-result-files"/>
<!-- Go here on error -->
<error to="filename-getting-error"/>
</action>
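<!-- How the hand-off works (the script itself is not shown here, so the example file
     names are assumptions): with capture-output, Oozie parses the action's stdout as
     key=value pairs, so get_ranking_files.sh is expected to print lines such as
     pr_file=PR_somefile
     attrank_file=AttRank_somefile
     which later actions read back via ${wf:actionData('get-file-names')['pr_file']} etc. -->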
<!-- Now we format the ranking files for the BiP! DB and for openaire (json files), in parallel -->
<fork name="format-result-files">
<path start="format-bip-files"/>
<path start="format-json-files"/>
</fork>
<!-- Format json files -->
<!-- Two parts: a) format the files b) rename the file endings to .json.gz -->
<action name="format-json-files">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Format Ranking Results JSON</name>
<!-- Script name goes here -->
<jar>format_ranking_results.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>json</arg>
<!-- Input files are identified dynamically from the get-file-names output -->
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<!-- Num partitions -->
<arg>7680</arg>
<!-- Type of data to be produced [bip (dois) / openaire (openaire ids)] -->
<arg>openaire</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="join-file-formatting"/>
<!-- Go here on error -->
<error to="json-formatting-fail"/>
</action>
<!-- This is the second branch of the parallel execution, where we create the BiP! DB files -->
<action name="format-bip-files">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Format Ranking Results BiP! DB</name>
<!-- Script name goes here -->
<jar>format_ranking_results.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>zenodo</arg>
<!-- Input files are identified dynamically from the get-file-names output -->
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<!-- Num partitions -->
<arg>7680</arg>
<!-- Type of data to be produced [bip (dois) / openaire (openaire ids)] -->
<arg>openaire</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="join-file-formatting"/>
<!-- Go here on error -->
<error to="bip-formatting-fail"/>
</action>
<!-- Finish formatting the data and continue with the id-to-doi mapping -->
<join name="join-file-formatting" to="map-openaire-to-doi"/>
<!-- Script written by Serafeim: maps openaire ids to their doi synonyms -->
<action name="map-openaire-to-doi">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- Delete the previously created doi synonym folder -->
<prepare>
<delete path="${synonymFolder}"/>
</prepare>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Openaire-DOI synonym collection</name>
<!-- Script name goes here -->
<jar>map_openaire_ids_to_dois.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireDataInput}</arg>
<!-- Output folder for the openaire-to-doi synonym mapping -->
<arg>${synonymFolder}</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to="map-scores-to-dois"/>
<!-- Go here on error -->
<error to="synonym-collection-fail"/>
</action>
<!-- Script written by Serafeim: maps the ranking scores from openaire ids to their doi synonyms -->
<action name="map-scores-to-dois">
<!-- This tag is required for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager, taken from the job properties -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node, taken from the job properties -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Mapping Openaire Scores to DOIs</name>
<!-- Script name goes here -->
<jar>map_scores_to_dois.py</jar>
<!-- spark configuration options: most are taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${synonymFolder}</arg>
<!-- Number of partitions -->
<arg>7680</arg>
<!-- The remaining inputs are the ranking files produced for the bip db -->
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<!-- The script on HDFS; the part after '#' is the symlink name created in the container's working directory -->
<file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
</spark>
<!-- Continue here after finishing successfully -->
<ok to= "deleteOutputPathForActionSet" />
<!-- Go here on error -->
<error to="map-scores-fail"/>
</action>
<action name= "deleteOutputPathForActionSet" >
<fs >
<delete path= "${actionSetOutputPath}" />
<mkdir path= "${actionSetOutputPath}" />
<!--
<delete path= "${workingDir}" />
<mkdir path= "${workingDir}" />
-->
</fs>
<ok to= "createActionSet" />
<error to= "actionset-delete-fail" />
</action>
<action name= "createActionSet" >
<spark xmlns= "uri:oozie:spark-action:0.2" >
<master > yarn</master>
<mode > cluster</mode>
<name > Produces the atomic action with the bip finder scores for publications</name>
<class > eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar > dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts >
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg > --inputPath</arg> <arg > ${bipScorePath}</arg>
<arg > --outputPath</arg> <arg > ${actionSetOutputPath}</arg>
</spark>
<ok to= "end" />
<error to= "actionset-creation-fail" />
</action>
<!-- Define the ending node -->
<end name="end"/>
<!-- Definitions of failure messages -->
<kill name="pagerank-fail">
<message>PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="attrank-fail">
<message>AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="cc-fail">
<message>CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="impulse-fail">
<message>Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="ram-fail">
<message>RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="openaire-graph-error">
<message>Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="filename-getting-error">
<message>Getting the ranking file names failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="json-formatting-fail">
<message>Formatting of the json results failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="bip-formatting-fail">
<message>Formatting of the BiP! DB results failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="synonym-collection-fail">
<message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="map-scores-fail">
<message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name= "actionset-delete-fail" >
<message > Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name= "actionset-creation-fail" >
<message > ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
</workflow-app>