dnet-hadoop/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml

<workflow-app xmlns="uri:oozie:workflow:0.5" name="ranking-wf">
<!-- Start with a decision node to determine from which point onwards the job will continue -->
<!-- <start to="get-doi-synonyms" /> -->
<start to="entry-point-decision" />
<decision name="entry-point-decision">
<switch>
<!-- The default is the normal starting point, i.e. create-openaire-ranking-graph -->
<!-- If any different condition is set, go to the corresponding start -->
<case to="non-iterative-rankings">${resume eq "rankings-start"}</case>
<case to="spark-impulse">${resume eq "impulse"}</case>
<case to="iterative-rankings">${resume eq "rankings-iterative"}</case>
<case to="get-file-names">${resume eq "format-results"}</case>
<case to="map-openaire-to-doi">${resume eq "map-ids"}</case>
<case to="map-scores-to-dois">${resume eq "map-scores"}</case>
<case to="create-openaire-ranking-graph">${resume eq "start"}</case>
<!-- TODO: add action set creation here -->
<default to="create-openaire-ranking-graph" />
</switch>
</decision>
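<!-- The "resume" property selects the entry point above. It is typically supplied with the job
     configuration at submission time (property file naming is an assumption and may differ);
     for example, setting resume=impulse skips graph creation and the non-iterative rankings
     and continues from the impulse computation, while any unrecognised value falls through to
     the default and runs the full workflow from the graph creation step. -->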
<!-- Creates the openaire graph that is used as input for the ranking computations -->
<action name="create-openaire-ranking-graph">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- Delete previously created doi synonym folder -->
<prepare>
<delete path="${synonymFolder}"/>
</prepare>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Openaire Ranking Graph Creation</name>
<!-- Script name goes here -->
<jar>create_openaire_ranking_graph.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 20G --executor-cores 4 --driver-memory 20G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<!-- The openaire graph data from which to read relations and objects -->
<arg>${openaireDataInput}</arg>
<!-- Current year, used to filter out entries with a later or empty year -->
<arg>${currentYear}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<!-- Output path of the openaire graph that serves as input for the ranking -->
<arg>${openaireGraphInputPath}</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py</file>
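<!-- In Oozie <file> elements, the "path#name" syntax ships the file through the distributed
     cache and exposes it under the given symlink name in the container's working directory,
     which is why the <jar> element above can refer to the script by its bare file name. -->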
</spark>
<!-- Do this after finishing okay -->
<ok to="non-iterative-rankings" />
<!-- Go there if we have an error -->
<error to="openaire-graph-error" />
</action>
<!-- Citation Count and RAM are calculated in parallel -->
<!-- Impulse requires resources and will be run afterwards -->
<fork name="non-iterative-rankings">
<path start="spark-cc"/>
<!-- <path start="spark-impulse"/> -->
<path start="spark-ram"/>
</fork>
<!-- CC here -->
<action name="spark-cc">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark CC</name>
<!-- Script name goes here -->
<jar>CC.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/CC.py#CC.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="join-non-iterative-rankings" />
<!-- Go there if we have an error -->
<error to="cc-fail" />
</action>
<!-- RAM here -->
<action name="spark-ram">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark RAM</name>
<!-- Script name goes here -->
<jar>TAR.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<arg>${ramGamma}</arg>
<arg>${currentYear}</arg>
<arg>RAM</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<arg>${γιτ α}</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/TAR.py#TAR.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="join-non-iterative-rankings" />
<!-- Go there if we have an error -->
<error to="ram-fail" />
</action>
<!-- JOIN NON-ITERATIVE METHODS AND THEN CONTINUE TO ITERATIVE ONES -->
<join name="join-non-iterative-rankings" to="spark-impulse"/>
<!-- IMPULSE here -->
<action name="spark-impulse">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark Impulse</name>
<!-- Script name goes here -->
<jar>CC.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<arg>3</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/CC.py#CC.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="iterative-rankings" />
<!-- Go there if we have an error -->
<error to="impulse-fail" />
</action>
<fork name="iterative-rankings">
<path start="spark-pagerank"/>
<path start="spark-attrank"/>
</fork>
<!-- PAGERANK here -->
<action name="spark-pagerank">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- We could add map-reduce configs here, but it is unclear whether they are needed -->
<!-- Alternative master/mode settings, kept for reference -->
<!-- <master>yarn-client</master> -->
<!-- Per the Oozie reference, the master element indicates the URL of the Spark master, e.g. spark://host:port, mesos://host:port, yarn-cluster, yarn-master, or local -->
<!-- <master>local[*]</master> -->
<!-- Per the Oozie reference, the mode element, if present, indicates where the Spark driver program runs, e.g. client or cluster -->
<!-- <mode>client</mode> -->
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark Pagerank</name>
<!-- Script name goes here -->
<jar>PageRank.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<arg>${pageRankAlpha}</arg>
<arg>${convergenceError}</arg>
<arg>${checkpointDir}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<arg>dfs</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/PageRank.py#PageRank.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="join-iterative-rankings" />
<!-- Go there if we have an error -->
<error to="pagerank-fail" />
</action>
<!-- ATTRANK here -->
<action name="spark-attrank">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Spark AttRank</name>
<!-- Script name goes here -->
<jar>AttRank.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireGraphInputPath}</arg>
<arg>${attrankAlpha}</arg>
<arg>${attrankBeta}</arg>
<arg>${attrankGamma}</arg>
<arg>${attrankRho}</arg>
<arg>${currentYear}</arg>
<arg>${attrankStartYear}</arg>
<arg>${convergenceError}</arg>
<arg>${checkpointDir}</arg>
<!-- number of partitions to be used on joins -->
<arg>7680</arg>
<arg>dfs</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/AttRank.py#AttRank.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="join-iterative-rankings" />
<!-- Go there if we have an error -->
<error to="attrank-fail" />
</action>
<!-- JOIN ITERATIVE METHODS AND THEN END -->
<join name="join-iterative-rankings" to="get-file-names"/>
<!-- Shell action that outputs the names of the ranking output files as key-value pairs -->
<action name="get-file-names">
<!-- This is required as a tag for shell jobs -->
<shell xmlns="uri:oozie:shell-action:0.3">
<!-- Same for all -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- Exec is required for shell commands and points to the shell executable to use -->
<exec>/usr/bin/bash</exec>
<!-- name of script to run -->
<argument>get_ranking_files.sh</argument>
<!-- We only pass the directory where we expect to find the rankings -->
<argument>/${workflowDataDir}</argument>
<!-- The script file to be shipped with the job -->
<file>${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh</file>
<!-- Get the output in order to be usable by following actions -->
<capture-output/>
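<!-- With <capture-output/>, the key=value lines printed by get_ranking_files.sh become
     available to later actions via wf:actionData('get-file-names'). Downstream actions
     expect the keys pr_file, attrank_file, cc_file, impulse_file and ram_file, each holding
     the name of the corresponding ranking output; the exact naming scheme of those files is
     determined by the script itself. -->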
</shell>
<!-- Do this after finishing okay -->
<ok to="format-result-files" />
<!-- Go there if we have an error -->
<error to="filename-getting-error" />
</action>
<!-- Now we will run in parallel the formatting of ranking files for BiP! DB and openaire (json files) -->
<fork name="format-result-files">
<path start="format-bip-files"/>
<path start="format-json-files"/>
</fork>
<!-- Format json files -->
<!-- Two parts: a) format files b) make the file endings .json.gz -->
<action name="format-json-files">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Format Ranking Results JSON</name>
<!-- Script name goes here -->
<jar>format_ranking_results.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>json</arg>
<!-- Input files must be identified dynamically -->
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<!-- Num partitions -->
<arg>7680</arg>
<!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
<arg>openaire</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="join-file-formatting" />
<!-- Go there if we have an error -->
<error to="json-formatting-fail" />
</action>
<!-- This is the second line of parallel workflow execution where we create the BiP! DB files -->
<action name="format-bip-files">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Format Ranking Results BiP! DB</name>
<!-- Script name goes here -->
<jar>format_ranking_results.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>zenodo</arg>
<!-- Input files must be identified dynamically -->
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<!-- Num partitions -->
<arg>7680</arg>
<!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
<arg>openaire</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="join-file-formatting" />
<!-- Go there if we have an error -->
<error to="bip-formatting-fail" />
</action>
<!-- Finish formatting data and continue to the id-to-DOI mapping -->
<join name="join-file-formatting" to="map-openaire-to-doi"/>
<!-- Script written by Serafeim: maps openaire ids to their DOI synonyms -->
<action name="map-openaire-to-doi">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- Delete previously created doi synonym folder -->
<prepare>
<delete path="${synonymFolder}"/>
</prepare>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Openaire-DOI synonym collection</name>
<!-- Script name goes here -->
<jar>map_openaire_ids_to_dois.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${openaireDataInput}</arg>
<!-- Output folder for the openaire-id to DOI synonyms -->
<arg>${synonymFolder}</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="map-scores-to-dois" />
<!-- Go there if we have an error -->
<error to="synonym-collection-fail" />
</action>
<!-- Script written by Serafeim: maps the ranking scores from openaire ids to DOIs -->
<action name="map-scores-to-dois">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- YARN resource manager (job tracker) -->
<job-tracker>${jobTracker}</job-tracker>
<!-- HDFS name node -->
<name-node>${nameNode}</name-node>
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Mapping Openaire Scores to DOIs</name>
<!-- Script name goes here -->
<jar>map_scores_to_dois.py</jar>
<!-- Spark configuration options, mostly taken from an example in the dhp workflows -->
<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
--master yarn
--deploy-mode cluster
--conf spark.sql.shuffle.partitions=7680
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
<!-- Script arguments here -->
<arg>${synonymFolder}</arg>
<!-- Number of partitions -->
<arg>7680</arg>
<!-- The remaining inputs are the ranking files produced for the BiP! DB -->
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<!-- This needs to point to the script file on HDFS -->
<file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
</spark>
<!-- Do this after finishing okay -->
<ok to="deleteOutputPathForActionSet" />
<!-- Go there if we have an error -->
<error to="map-scores-fail" />
</action>
<action name="deleteOutputPathForActionSet">
<fs>
<delete path="${actionSetOutputPath}"/>
<mkdir path="${actionSetOutputPath}"/>
<!--
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
-->
</fs>
<ok to="createActionSet"/>
<error to="actionset-delete-fail"/>
</action>
<action name="createActionSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for publications</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${bipScorePath}</arg>
<arg>--outputPath</arg><arg>${actionSetOutputPath}</arg>
</spark>
<ok to="end"/>
<error to="actionset-creation-fail"/>
</action>
<!-- Define ending node -->
<end name="end" />
<!-- Definitions of failure messages -->
<kill name="pagerank-fail">
<message>PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="attrank-fail">
<message>AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="cc-fail">
<message>CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="impulse-fail">
<message>Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="ram-fail">
<message>RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="openaire-graph-error">
<message>Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="synonym-collection-fail">
<message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="map-scores-fail">
<message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="actionset-delete-fail">
<message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="actionset-creation-fail">
<message>ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
</workflow-app>